vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "profiles.h"
28 #include "thread.h"
29 #include "videodsp.h"
30 #include "vp56.h"
31 #include "vp9.h"
32 #include "vp9data.h"
33 #include "vp9dsp.h"
34 #include "libavutil/avassert.h"
35 #include "libavutil/pixdesc.h"
36 
37 #define VP9_SYNCCODE 0x498342
38 
39 struct VP9Filter {
40  uint8_t level[8 * 8];
41  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
42  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
43 };
44 
45 typedef struct VP9Block {
46  uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
47  enum FilterMode filter;
48  VP56mv mv[4 /* b_idx */][2 /* ref */];
49  enum BlockSize bs;
50  enum TxfmMode tx, uvtx;
51  enum BlockLevel bl;
52  enum BlockPartition bp;
53 } VP9Block;
54 
55 typedef struct VP9Context {
57 
63  unsigned c_b_size;
65  int pass;
66  int row, row7, col, col7;
67  uint8_t *dst[3];
68  ptrdiff_t y_stride, uv_stride;
69 
73  // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal
74  // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads
75  // and are therefore per-stream. pix_fmt represents the value in the header
76  // of the currently processed frame.
77  int w, h;
78  enum AVPixelFormat pix_fmt, last_fmt, gf_fmt;
79  unsigned sb_cols, sb_rows, rows, cols;
81 
82  struct {
85  } filter_lut;
87  struct {
89  uint8_t coef[4][2][2][6][6][3];
90  } prob_ctx[4];
91  struct {
93  uint8_t coef[4][2][2][6][6][11];
94  } prob;
95  struct {
96  unsigned y_mode[4][10];
97  unsigned uv_mode[10][10];
98  unsigned filter[4][3];
99  unsigned mv_mode[7][4];
100  unsigned intra[4][2];
101  unsigned comp[5][2];
102  unsigned single_ref[5][2][2];
103  unsigned comp_ref[5][2];
104  unsigned tx32p[2][4];
105  unsigned tx16p[2][3];
106  unsigned tx8p[2][2];
107  unsigned skip[3][2];
108  unsigned mv_joint[4];
109  struct {
110  unsigned sign[2];
111  unsigned classes[11];
112  unsigned class0[2];
113  unsigned bits[10][2];
114  unsigned class0_fp[2][4];
115  unsigned fp[4];
116  unsigned class0_hp[2];
117  unsigned hp[2];
118  } mv_comp[2];
119  unsigned partition[4][4][4];
120  unsigned coef[4][2][2][6][6][3];
121  unsigned eob[4][2][2][6][6][2];
122  } counts;
123 
124  // contextual (left/above) cache
139  // FIXME maybe merge some of the below in a flags field?
150 
151  // whole-frame cache
153  struct VP9Filter *lflvl;
155 
156  // block reconstruction intermediates
158  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
160  struct { int x, y; } min_mv, max_mv;
161  DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
162  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
163  uint16_t mvscale[3][2];
165 } VP9Context;
166 
167 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
168  {
169  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
170  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
171  }, {
172  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
173  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
174  }
175 };
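/* Illustrative reading of bwh_tab[] above: the first sub-table appears to give
 * block width/height in units of 4x4 blocks and the second in units of 8x8
 * blocks clamped to a minimum of 1; e.g. a 64x64 block maps to { 16, 16 } and
 * { 8, 8 }, while the sub-8x8 sizes all collapse to { 1, 1 } in the 8x8-unit
 * table. */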
176 
177 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
178 {
179  ff_thread_release_buffer(ctx, &f->tf);
180  av_buffer_unref(&f->extradata);
181  av_buffer_unref(&f->hwaccel_priv_buf);
182  f->segmentation_map = NULL;
183  f->hwaccel_picture_private = NULL;
184 }
185 
186 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
187 {
188  VP9Context *s = ctx->priv_data;
189  int ret, sz;
190 
191  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
192  return ret;
193  sz = 64 * s->sb_cols * s->sb_rows;
194  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
195  goto fail;
196  }
197 
198  f->segmentation_map = f->extradata->data;
199  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
200 
201  if (ctx->hwaccel) {
202  const AVHWAccel *hwaccel = ctx->hwaccel;
204  if (hwaccel->frame_priv_data_size) {
205  f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
206  if (!f->hwaccel_priv_buf)
207  goto fail;
208  f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
209  }
210  }
211 
212  return 0;
213 
214 fail:
215  vp9_unref_frame(ctx, f);
216  return AVERROR(ENOMEM);
217 }
218 
219 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
220 {
221  int res;
222 
223  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
224  return res;
225  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
226  goto fail;
227  }
228 
229  dst->segmentation_map = src->segmentation_map;
230  dst->mv = src->mv;
231  dst->uses_2pass = src->uses_2pass;
232 
233  if (src->hwaccel_picture_private) {
234  dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
235  if (!dst->hwaccel_priv_buf)
236  goto fail;
237  dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
238  }
239 
240  return 0;
241 
242 fail:
243  vp9_unref_frame(ctx, dst);
244  return AVERROR(ENOMEM);
245 }
246 
247 static int update_size(AVCodecContext *ctx, int w, int h)
248 {
249 #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL + CONFIG_VP9_VAAPI_HWACCEL)
250  enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
251  VP9Context *s = ctx->priv_data;
252  uint8_t *p;
253  int bytesperpixel = s->bytesperpixel, res, cols, rows;
254 
255  av_assert0(w > 0 && h > 0);
256 
257  if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) {
258  if ((res = ff_set_dimensions(ctx, w, h)) < 0)
259  return res;
260 
261  switch (s->pix_fmt) {
262  case AV_PIX_FMT_YUV420P:
263 #if CONFIG_VP9_DXVA2_HWACCEL
264  *fmtp++ = AV_PIX_FMT_DXVA2_VLD;
265 #endif
266 #if CONFIG_VP9_D3D11VA_HWACCEL
267  *fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
268 #endif
269 #if CONFIG_VP9_VAAPI_HWACCEL
270  *fmtp++ = AV_PIX_FMT_VAAPI;
271 #endif
272  break;
275 #if CONFIG_VP9_VAAPI_HWACCEL
276  *fmtp++ = AV_PIX_FMT_VAAPI;
277 #endif
278  break;
279  }
280 
281  *fmtp++ = s->pix_fmt;
282  *fmtp = AV_PIX_FMT_NONE;
283 
284  res = ff_thread_get_format(ctx, pix_fmts);
285  if (res < 0)
286  return res;
287 
288  ctx->pix_fmt = res;
289  s->gf_fmt = s->pix_fmt;
290  s->w = w;
291  s->h = h;
292  }
293 
294  cols = (w + 7) >> 3;
295  rows = (h + 7) >> 3;
296 
297  if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt)
298  return 0;
299 
300  s->last_fmt = s->pix_fmt;
301  s->sb_cols = (w + 63) >> 6;
302  s->sb_rows = (h + 63) >> 6;
303  s->cols = (w + 7) >> 3;
304  s->rows = (h + 7) >> 3;
305 
306 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
307  av_freep(&s->intra_pred_data[0]);
308  // FIXME we slightly over-allocate here for subsampled chroma, but a little
309  // bit of padding shouldn't affect performance...
310  p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
311  sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
312  if (!p)
313  return AVERROR(ENOMEM);
314  assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
315  assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel);
316  assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel);
317  assign(s->above_y_nnz_ctx, uint8_t *, 16);
318  assign(s->above_mode_ctx, uint8_t *, 16);
319  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
320  assign(s->above_uv_nnz_ctx[0], uint8_t *, 16);
321  assign(s->above_uv_nnz_ctx[1], uint8_t *, 16);
323  assign(s->above_skip_ctx, uint8_t *, 8);
324  assign(s->above_txfm_ctx, uint8_t *, 8);
325  assign(s->above_segpred_ctx, uint8_t *, 8);
326  assign(s->above_intra_ctx, uint8_t *, 8);
327  assign(s->above_comp_ctx, uint8_t *, 8);
328  assign(s->above_ref_ctx, uint8_t *, 8);
329  assign(s->above_filter_ctx, uint8_t *, 8);
330  assign(s->lflvl, struct VP9Filter *, 1);
331 #undef assign
332 
333  // these will be re-allocated a little later
334  av_freep(&s->b_base);
335  av_freep(&s->block_base);
336 
337  if (s->s.h.bpp != s->last_bpp) {
339  ff_videodsp_init(&s->vdsp, s->s.h.bpp);
340  s->last_bpp = s->s.h.bpp;
341  }
342 
343  return 0;
344 }
345 
346 static int update_block_buffers(AVCodecContext *ctx)
347 {
348  VP9Context *s = ctx->priv_data;
349  int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
350 
351  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
352  return 0;
353 
354  av_free(s->b_base);
355  av_free(s->block_base);
356  chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
357  chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
358  if (s->s.frames[CUR_FRAME].uses_2pass) {
359  int sbs = s->sb_cols * s->sb_rows;
360 
361  s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
362  s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
363  16 * 16 + 2 * chroma_eobs) * sbs);
364  if (!s->b_base || !s->block_base)
365  return AVERROR(ENOMEM);
366  s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
367  s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
368  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
369  s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
370  s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
371  } else {
372  s->b_base = av_malloc(sizeof(VP9Block));
373  s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
374  16 * 16 + 2 * chroma_eobs);
375  if (!s->b_base || !s->block_base)
376  return AVERROR(ENOMEM);
377  s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
378  s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
379  s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
380  s->uveob_base[0] = s->eob_base + 16 * 16;
381  s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
382  }
383  s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
384 
385  return 0;
386 }
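/* Sizing sketch for the allocation above, assuming 8-bit 4:2:0 content
 * (ss_h = ss_v = 1, bytesperpixel = 1): chroma_blocks = 64*64 >> 2 = 1024 and
 * chroma_eobs = 16*16 >> 2 = 64, so one superblock slot needs
 * (64*64 + 2*1024) * sizeof(int16_t) = 12288 bytes of coefficient storage plus
 * 16*16 + 2*64 = 384 EOB bytes; the 2-pass path allocates one such slot per
 * superblock, the single-pass path just one. */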
387 
388 // for some reason the sign bit is at the end, not the start, of a bit sequence
389 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
390 {
391  int v = get_bits(gb, n);
392  return get_bits1(gb) ? -v : v;
393 }
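/* Example for get_sbits_inv(): with n = 4, the bits 0101 followed by a sign
 * bit of 1 decode as v = 5 and return -5; a sign bit of 0 would return +5. */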
394 
395 static av_always_inline int inv_recenter_nonneg(int v, int m)
396 {
397  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
398 }
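/* Worked example for inv_recenter_nonneg(): with m = 5, v = 0..4 maps to
 * 5, 4, 6, 3, 7 (alternating around m, odd v below and even v above it),
 * while any v > 2*m (here v >= 11) is passed through unchanged. */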
399 
400 // differential forward probability updates
401 static int update_prob(VP56RangeCoder *c, int p)
402 {
403  static const int inv_map_table[255] = {
404  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
405  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
406  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
407  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
408  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
409  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
410  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
411  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
412  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
413  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
414  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
415  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
416  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
417  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
418  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
419  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
420  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
421  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
422  252, 253, 253,
423  };
424  int d;
425 
426  /* This code performs a differential probability update. For a current
427  * probability A in the range [1, 255], the difference to any new
428  * probability value lies in [1 - A, 255 - A]. Part of that absolute
429  * range exists on both the positive and the negative side, whereas the
430  * rest exists in only one half. The shared part is coded
431  * differentially, i.e. doubled, with the lowest bit carrying the sign,
432  * and the one-sided part is then coded on top of that. The resulting
433  * absolute difference again lies in [0, 254], and a bigger value in
434  * this range means we are further away from the original value A, so
435  * it can be coded as a VLC, since larger updates are increasingly
436  * unlikely. The first 20 values in inv_map_table[] allow 'cheap,
437  * rough' updates, as opposed to the 'fine, exact' updates further down
438  * the range, which adds one extra dimension to this differential
439  * update model. */
440 
441  if (!vp8_rac_get(c)) {
442  d = vp8_rac_get_uint(c, 4) + 0;
443  } else if (!vp8_rac_get(c)) {
444  d = vp8_rac_get_uint(c, 4) + 16;
445  } else if (!vp8_rac_get(c)) {
446  d = vp8_rac_get_uint(c, 5) + 32;
447  } else {
448  d = vp8_rac_get_uint(c, 7);
449  if (d >= 65)
450  d = (d << 1) - 65 + vp8_rac_get(c);
451  d += 64;
452  av_assert2(d < FF_ARRAY_ELEMS(inv_map_table));
453  }
454 
455  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
456  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
457 }
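/* Illustrative trace of update_prob(): for a current probability p = 128 and
 * a decoded d = 21, inv_map_table[21] = 2 and, since p <= 128, the result is
 * 1 + inv_recenter_nonneg(2, 127) = 1 + 128 = 129, i.e. a +1 update; d = 20
 * (inv_map_table[20] = 1) would give 1 + 126 = 127, a -1 update. The coarse
 * entries at the start of the table (7, 20, 33, ...) reach farther from p for
 * the same number of coded bits. */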
458 
459 static int read_colorspace_details(AVCodecContext *ctx)
460 {
461  static const enum AVColorSpace colorspaces[8] = {
464  };
465  VP9Context *s = ctx->priv_data;
466  int bits = ctx->profile <= 1 ? 0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12
467 
468  s->bpp_index = bits;
469  s->s.h.bpp = 8 + bits * 2;
470  s->bytesperpixel = (7 + s->s.h.bpp) >> 3;
471  ctx->colorspace = colorspaces[get_bits(&s->gb, 3)];
472  if (ctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1
473  static const enum AVPixelFormat pix_fmt_rgb[3] = {
475  };
476  s->ss_h = s->ss_v = 0;
478  s->pix_fmt = pix_fmt_rgb[bits];
479  if (ctx->profile & 1) {
480  if (get_bits1(&s->gb)) {
481  av_log(ctx, AV_LOG_ERROR, "Reserved bit set in RGB\n");
482  return AVERROR_INVALIDDATA;
483  }
484  } else {
485  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile %d\n",
486  ctx->profile);
487  return AVERROR_INVALIDDATA;
488  }
489  } else {
490  static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = {
497  };
499  if (ctx->profile & 1) {
500  s->ss_h = get_bits1(&s->gb);
501  s->ss_v = get_bits1(&s->gb);
502  s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h];
503  if (s->pix_fmt == AV_PIX_FMT_YUV420P) {
504  av_log(ctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n",
505  ctx->profile);
506  return AVERROR_INVALIDDATA;
507  } else if (get_bits1(&s->gb)) {
508  av_log(ctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n",
509  ctx->profile);
510  return AVERROR_INVALIDDATA;
511  }
512  } else {
513  s->ss_h = s->ss_v = 1;
514  s->pix_fmt = pix_fmt_for_ss[bits][1][1];
515  }
516  }
517 
518  return 0;
519 }
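/* Example of the bit-depth handling above: for profile 2 with the extra depth
 * bit set, bits = 2, so s->s.h.bpp = 8 + 2*2 = 12 and
 * s->bytesperpixel = (7 + 12) >> 3 = 2. */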
520 
521 static int decode_frame_header(AVCodecContext *ctx,
522  const uint8_t *data, int size, int *ref)
523 {
524  VP9Context *s = ctx->priv_data;
525  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
526  int last_invisible;
527  const uint8_t *data2;
528 
529  /* general header */
530  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
531  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
532  return res;
533  }
534  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
535  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
536  return AVERROR_INVALIDDATA;
537  }
538  ctx->profile = get_bits1(&s->gb);
539  ctx->profile |= get_bits1(&s->gb) << 1;
540  if (ctx->profile == 3) ctx->profile += get_bits1(&s->gb);
541  if (ctx->profile > 3) {
542  av_log(ctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", ctx->profile);
543  return AVERROR_INVALIDDATA;
544  }
545  s->s.h.profile = ctx->profile;
546  if (get_bits1(&s->gb)) {
547  *ref = get_bits(&s->gb, 3);
548  return 0;
549  }
550  s->last_keyframe = s->s.h.keyframe;
551  s->s.h.keyframe = !get_bits1(&s->gb);
552  last_invisible = s->s.h.invisible;
553  s->s.h.invisible = !get_bits1(&s->gb);
554  s->s.h.errorres = get_bits1(&s->gb);
555  s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible;
556  if (s->s.h.keyframe) {
557  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
558  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
559  return AVERROR_INVALIDDATA;
560  }
561  if ((res = read_colorspace_details(ctx)) < 0)
562  return res;
563  // for profile 1, here follows the subsampling bits
564  s->s.h.refreshrefmask = 0xff;
565  w = get_bits(&s->gb, 16) + 1;
566  h = get_bits(&s->gb, 16) + 1;
567  if (get_bits1(&s->gb)) // display size
568  skip_bits(&s->gb, 32);
569  } else {
570  s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0;
571  s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2);
572  if (s->s.h.intraonly) {
573  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
574  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
575  return AVERROR_INVALIDDATA;
576  }
577  if (ctx->profile >= 1) {
578  if ((res = read_colorspace_details(ctx)) < 0)
579  return res;
580  } else {
581  s->ss_h = s->ss_v = 1;
582  s->s.h.bpp = 8;
583  s->bpp_index = 0;
584  s->bytesperpixel = 1;
585  s->pix_fmt = AV_PIX_FMT_YUV420P;
588  }
589  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
590  w = get_bits(&s->gb, 16) + 1;
591  h = get_bits(&s->gb, 16) + 1;
592  if (get_bits1(&s->gb)) // display size
593  skip_bits(&s->gb, 32);
594  } else {
595  s->s.h.refreshrefmask = get_bits(&s->gb, 8);
596  s->s.h.refidx[0] = get_bits(&s->gb, 3);
597  s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres;
598  s->s.h.refidx[1] = get_bits(&s->gb, 3);
599  s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres;
600  s->s.h.refidx[2] = get_bits(&s->gb, 3);
601  s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres;
602  if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] ||
603  !s->s.refs[s->s.h.refidx[1]].f->buf[0] ||
604  !s->s.refs[s->s.h.refidx[2]].f->buf[0]) {
605  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
606  return AVERROR_INVALIDDATA;
607  }
608  if (get_bits1(&s->gb)) {
609  w = s->s.refs[s->s.h.refidx[0]].f->width;
610  h = s->s.refs[s->s.h.refidx[0]].f->height;
611  } else if (get_bits1(&s->gb)) {
612  w = s->s.refs[s->s.h.refidx[1]].f->width;
613  h = s->s.refs[s->s.h.refidx[1]].f->height;
614  } else if (get_bits1(&s->gb)) {
615  w = s->s.refs[s->s.h.refidx[2]].f->width;
616  h = s->s.refs[s->s.h.refidx[2]].f->height;
617  } else {
618  w = get_bits(&s->gb, 16) + 1;
619  h = get_bits(&s->gb, 16) + 1;
620  }
621  // Note that in this code, "CUR_FRAME" is actually before we
622  // have formally allocated a frame, and thus actually represents
623  // the _last_ frame
624  s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w &&
625  s->s.frames[CUR_FRAME].tf.f->height == h;
626  if (get_bits1(&s->gb)) // display size
627  skip_bits(&s->gb, 32);
628  s->s.h.highprecisionmvs = get_bits1(&s->gb);
629  s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
630  get_bits(&s->gb, 2);
631  s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] ||
632  s->s.h.signbias[0] != s->s.h.signbias[2];
633  if (s->s.h.allowcompinter) {
634  if (s->s.h.signbias[0] == s->s.h.signbias[1]) {
635  s->s.h.fixcompref = 2;
636  s->s.h.varcompref[0] = 0;
637  s->s.h.varcompref[1] = 1;
638  } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) {
639  s->s.h.fixcompref = 1;
640  s->s.h.varcompref[0] = 0;
641  s->s.h.varcompref[1] = 2;
642  } else {
643  s->s.h.fixcompref = 0;
644  s->s.h.varcompref[0] = 1;
645  s->s.h.varcompref[1] = 2;
646  }
647  }
648  }
649  }
650  s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb);
651  s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb);
652  s->s.h.framectxid = c = get_bits(&s->gb, 2);
653  if (s->s.h.keyframe || s->s.h.intraonly)
654  s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes
655 
656  /* loopfilter header data */
657  if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) {
658  // reset loopfilter defaults
659  s->s.h.lf_delta.ref[0] = 1;
660  s->s.h.lf_delta.ref[1] = 0;
661  s->s.h.lf_delta.ref[2] = -1;
662  s->s.h.lf_delta.ref[3] = -1;
663  s->s.h.lf_delta.mode[0] = 0;
664  s->s.h.lf_delta.mode[1] = 0;
665  memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat));
666  }
667  s->s.h.filter.level = get_bits(&s->gb, 6);
668  sharp = get_bits(&s->gb, 3);
669  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
670  // the old cache values since they are still valid
671  if (s->s.h.filter.sharpness != sharp)
672  memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
673  s->s.h.filter.sharpness = sharp;
674  if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
675  if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
676  for (i = 0; i < 4; i++)
677  if (get_bits1(&s->gb))
678  s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
679  for (i = 0; i < 2; i++)
680  if (get_bits1(&s->gb))
681  s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
682  }
683  }
684 
685  /* quantization header data */
686  s->s.h.yac_qi = get_bits(&s->gb, 8);
687  s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
688  s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
689  s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
690  s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 &&
691  s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0;
692  if (s->s.h.lossless)
694 
695  /* segmentation header info */
696  if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) {
697  if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) {
698  for (i = 0; i < 7; i++)
699  s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ?
700  get_bits(&s->gb, 8) : 255;
701  if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) {
702  for (i = 0; i < 3; i++)
703  s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ?
704  get_bits(&s->gb, 8) : 255;
705  }
706  }
707 
708  if (get_bits1(&s->gb)) {
709  s->s.h.segmentation.absolute_vals = get_bits1(&s->gb);
710  for (i = 0; i < 8; i++) {
711  if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
712  s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
713  if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
714  s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
715  if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
716  s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
717  s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
718  }
719  }
720  }
721 
722  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
723  for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) {
724  int qyac, qydc, quvac, quvdc, lflvl, sh;
725 
726  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) {
727  if (s->s.h.segmentation.absolute_vals)
728  qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8);
729  else
730  qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8);
731  } else {
732  qyac = s->s.h.yac_qi;
733  }
734  qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8);
735  quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8);
736  quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8);
737  qyac = av_clip_uintp2(qyac, 8);
738 
739  s->s.h.segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[s->bpp_index][qydc];
740  s->s.h.segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[s->bpp_index][qyac];
741  s->s.h.segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[s->bpp_index][quvdc];
742  s->s.h.segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[s->bpp_index][quvac];
743 
744  sh = s->s.h.filter.level >= 32;
745  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) {
746  if (s->s.h.segmentation.absolute_vals)
747  lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6);
748  else
749  lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6);
750  } else {
751  lflvl = s->s.h.filter.level;
752  }
753  if (s->s.h.lf_delta.enabled) {
754  s->s.h.segmentation.feat[i].lflvl[0][0] =
755  s->s.h.segmentation.feat[i].lflvl[0][1] =
756  av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6);
757  for (j = 1; j < 4; j++) {
758  s->s.h.segmentation.feat[i].lflvl[j][0] =
759  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
760  s->s.h.lf_delta.mode[0]) * (1 << sh)), 6);
761  s->s.h.segmentation.feat[i].lflvl[j][1] =
762  av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] +
763  s->s.h.lf_delta.mode[1]) * (1 << sh)), 6);
764  }
765  } else {
766  memset(s->s.h.segmentation.feat[i].lflvl, lflvl,
767  sizeof(s->s.h.segmentation.feat[i].lflvl));
768  }
769  }
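/* Worked trace of the loop above, assuming yac_qi = 60, ydc_qdelta = -6, no
 * per-segment overrides and filter.level = 36: qyac = 60, qydc = 54, and the
 * qmul[] entries are the vp9_dc_qlookup/vp9_ac_qlookup values at those
 * indices. Since filter.level >= 32, sh = 1, so with lf_delta enabled and the
 * keyframe default ref[0] = 1, lflvl[0][*] = av_clip_uintp2(36 + (1 << 1), 6)
 * = 38. */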
770 
771  /* tiling info */
772  if ((res = update_size(ctx, w, h)) < 0) {
773  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n",
774  w, h, s->pix_fmt);
775  return res;
776  }
777  for (s->s.h.tiling.log2_tile_cols = 0;
778  s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols);
779  s->s.h.tiling.log2_tile_cols++) ;
780  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
781  max = FFMAX(0, max - 1);
782  while (max > s->s.h.tiling.log2_tile_cols) {
783  if (get_bits1(&s->gb))
784  s->s.h.tiling.log2_tile_cols++;
785  else
786  break;
787  }
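/* Illustrative tiling bounds: for a 1920-pixel-wide frame, sb_cols =
 * (1920 + 63) >> 6 = 30, so the first loop leaves log2_tile_cols at 0 and the
 * second caps it at max = 2 (since 30 >> 3 < 4); the per-frame bits can then
 * raise log2_tile_cols anywhere within [0, 2]. */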
788  s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
789  s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
790  if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
791  s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
792  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
793  sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
794  if (!s->c_b) {
795  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
796  return AVERROR(ENOMEM);
797  }
798  }
799 
800  /* check reference frames */
801  if (!s->s.h.keyframe && !s->s.h.intraonly) {
802  for (i = 0; i < 3; i++) {
803  AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f;
804  int refw = ref->width, refh = ref->height;
805 
806  if (ref->format != ctx->pix_fmt) {
807  av_log(ctx, AV_LOG_ERROR,
808  "Ref pixfmt (%s) did not match current frame (%s)",
811  return AVERROR_INVALIDDATA;
812  } else if (refw == w && refh == h) {
813  s->mvscale[i][0] = s->mvscale[i][1] = 0;
814  } else {
815  if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
816  av_log(ctx, AV_LOG_ERROR,
817  "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
818  refw, refh, w, h);
819  return AVERROR_INVALIDDATA;
820  }
821  s->mvscale[i][0] = (refw << 14) / w;
822  s->mvscale[i][1] = (refh << 14) / h;
823  s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
824  s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
825  }
826  }
827  }
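/* Illustrative note on the scaling above: mvscale[] is a Q14 fixed-point
 * ratio, so a 640x360 reference used for a 1280x720 frame gives
 * mvscale[i][0] = (640 << 14) / 1280 = 8192 (i.e. 0.5) and
 * mvstep[i][0] = 16 * 8192 >> 14 = 8. */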
828 
829  if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) {
830  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
831  s->prob_ctx[3].p = vp9_default_probs;
832  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
833  sizeof(vp9_default_coef_probs));
834  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
835  sizeof(vp9_default_coef_probs));
836  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
837  sizeof(vp9_default_coef_probs));
838  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
839  sizeof(vp9_default_coef_probs));
840  } else if (s->s.h.intraonly && s->s.h.resetctx == 2) {
841  s->prob_ctx[c].p = vp9_default_probs;
842  memcpy(s->prob_ctx[c].coef, vp9_default_coef_probs,
843  sizeof(vp9_default_coef_probs));
844  }
845 
846  // the next 16 bits are the size of the rest of the header (arith-coded)
847  s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16);
848  s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8;
849 
850  data2 = align_get_bits(&s->gb);
851  if (size2 > size - (data2 - data)) {
852  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
853  return AVERROR_INVALIDDATA;
854  }
855  ff_vp56_init_range_decoder(&s->c, data2, size2);
856  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
857  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
858  return AVERROR_INVALIDDATA;
859  }
860 
861  if (s->s.h.keyframe || s->s.h.intraonly) {
862  memset(s->counts.coef, 0, sizeof(s->counts.coef));
863  memset(s->counts.eob, 0, sizeof(s->counts.eob));
864  } else {
865  memset(&s->counts, 0, sizeof(s->counts));
866  }
867  // FIXME is it faster to not copy here, but do it down in the fw updates
868  // as explicit copies if the fw update is missing (and skip the copy upon
869  // fw update)?
870  s->prob.p = s->prob_ctx[c].p;
871 
872  // txfm updates
873  if (s->s.h.lossless) {
874  s->s.h.txfmmode = TX_4X4;
875  } else {
876  s->s.h.txfmmode = vp8_rac_get_uint(&s->c, 2);
877  if (s->s.h.txfmmode == 3)
878  s->s.h.txfmmode += vp8_rac_get(&s->c);
879 
880  if (s->s.h.txfmmode == TX_SWITCHABLE) {
881  for (i = 0; i < 2; i++)
882  if (vp56_rac_get_prob_branchy(&s->c, 252))
883  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
884  for (i = 0; i < 2; i++)
885  for (j = 0; j < 2; j++)
886  if (vp56_rac_get_prob_branchy(&s->c, 252))
887  s->prob.p.tx16p[i][j] =
888  update_prob(&s->c, s->prob.p.tx16p[i][j]);
889  for (i = 0; i < 2; i++)
890  for (j = 0; j < 3; j++)
891  if (vp56_rac_get_prob_branchy(&s->c, 252))
892  s->prob.p.tx32p[i][j] =
893  update_prob(&s->c, s->prob.p.tx32p[i][j]);
894  }
895  }
896 
897  // coef updates
898  for (i = 0; i < 4; i++) {
899  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
900  if (vp8_rac_get(&s->c)) {
901  for (j = 0; j < 2; j++)
902  for (k = 0; k < 2; k++)
903  for (l = 0; l < 6; l++)
904  for (m = 0; m < 6; m++) {
905  uint8_t *p = s->prob.coef[i][j][k][l][m];
906  uint8_t *r = ref[j][k][l][m];
907  if (m >= 3 && l == 0) // dc only has 3 pt
908  break;
909  for (n = 0; n < 3; n++) {
910  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
911  p[n] = update_prob(&s->c, r[n]);
912  } else {
913  p[n] = r[n];
914  }
915  }
916  p[3] = 0;
917  }
918  } else {
919  for (j = 0; j < 2; j++)
920  for (k = 0; k < 2; k++)
921  for (l = 0; l < 6; l++)
922  for (m = 0; m < 6; m++) {
923  uint8_t *p = s->prob.coef[i][j][k][l][m];
924  uint8_t *r = ref[j][k][l][m];
925  if (m > 3 && l == 0) // dc only has 3 pt
926  break;
927  memcpy(p, r, 3);
928  p[3] = 0;
929  }
930  }
931  if (s->s.h.txfmmode == i)
932  break;
933  }
934 
935  // mode updates
936  for (i = 0; i < 3; i++)
937  if (vp56_rac_get_prob_branchy(&s->c, 252))
938  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
939  if (!s->s.h.keyframe && !s->s.h.intraonly) {
940  for (i = 0; i < 7; i++)
941  for (j = 0; j < 3; j++)
942  if (vp56_rac_get_prob_branchy(&s->c, 252))
943  s->prob.p.mv_mode[i][j] =
944  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
945 
946  if (s->s.h.filtermode == FILTER_SWITCHABLE)
947  for (i = 0; i < 4; i++)
948  for (j = 0; j < 2; j++)
949  if (vp56_rac_get_prob_branchy(&s->c, 252))
950  s->prob.p.filter[i][j] =
951  update_prob(&s->c, s->prob.p.filter[i][j]);
952 
953  for (i = 0; i < 4; i++)
954  if (vp56_rac_get_prob_branchy(&s->c, 252))
955  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
956 
957  if (s->s.h.allowcompinter) {
958  s->s.h.comppredmode = vp8_rac_get(&s->c);
959  if (s->s.h.comppredmode)
960  s->s.h.comppredmode += vp8_rac_get(&s->c);
961  if (s->s.h.comppredmode == PRED_SWITCHABLE)
962  for (i = 0; i < 5; i++)
963  if (vp56_rac_get_prob_branchy(&s->c, 252))
964  s->prob.p.comp[i] =
965  update_prob(&s->c, s->prob.p.comp[i]);
966  } else {
967  s->s.h.comppredmode = PRED_SINGLEREF;
968  }
969 
970  if (s->s.h.comppredmode != PRED_COMPREF) {
971  for (i = 0; i < 5; i++) {
972  if (vp56_rac_get_prob_branchy(&s->c, 252))
973  s->prob.p.single_ref[i][0] =
974  update_prob(&s->c, s->prob.p.single_ref[i][0]);
975  if (vp56_rac_get_prob_branchy(&s->c, 252))
976  s->prob.p.single_ref[i][1] =
977  update_prob(&s->c, s->prob.p.single_ref[i][1]);
978  }
979  }
980 
981  if (s->s.h.comppredmode != PRED_SINGLEREF) {
982  for (i = 0; i < 5; i++)
983  if (vp56_rac_get_prob_branchy(&s->c, 252))
984  s->prob.p.comp_ref[i] =
985  update_prob(&s->c, s->prob.p.comp_ref[i]);
986  }
987 
988  for (i = 0; i < 4; i++)
989  for (j = 0; j < 9; j++)
990  if (vp56_rac_get_prob_branchy(&s->c, 252))
991  s->prob.p.y_mode[i][j] =
992  update_prob(&s->c, s->prob.p.y_mode[i][j]);
993 
994  for (i = 0; i < 4; i++)
995  for (j = 0; j < 4; j++)
996  for (k = 0; k < 3; k++)
997  if (vp56_rac_get_prob_branchy(&s->c, 252))
998  s->prob.p.partition[3 - i][j][k] =
999  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
1000 
1001  // mv fields don't use the update_prob subexp model for some reason
1002  for (i = 0; i < 3; i++)
1003  if (vp56_rac_get_prob_branchy(&s->c, 252))
1004  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1005 
1006  for (i = 0; i < 2; i++) {
1007  if (vp56_rac_get_prob_branchy(&s->c, 252))
1008  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1009 
1010  for (j = 0; j < 10; j++)
1011  if (vp56_rac_get_prob_branchy(&s->c, 252))
1012  s->prob.p.mv_comp[i].classes[j] =
1013  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1014 
1015  if (vp56_rac_get_prob_branchy(&s->c, 252))
1016  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1017 
1018  for (j = 0; j < 10; j++)
1019  if (vp56_rac_get_prob_branchy(&s->c, 252))
1020  s->prob.p.mv_comp[i].bits[j] =
1021  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1022  }
1023 
1024  for (i = 0; i < 2; i++) {
1025  for (j = 0; j < 2; j++)
1026  for (k = 0; k < 3; k++)
1027  if (vp56_rac_get_prob_branchy(&s->c, 252))
1028  s->prob.p.mv_comp[i].class0_fp[j][k] =
1029  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1030 
1031  for (j = 0; j < 3; j++)
1032  if (vp56_rac_get_prob_branchy(&s->c, 252))
1033  s->prob.p.mv_comp[i].fp[j] =
1034  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1035  }
1036 
1037  if (s->s.h.highprecisionmvs) {
1038  for (i = 0; i < 2; i++) {
1039  if (vp56_rac_get_prob_branchy(&s->c, 252))
1040  s->prob.p.mv_comp[i].class0_hp =
1041  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1042 
1043  if (vp56_rac_get_prob_branchy(&s->c, 252))
1044  s->prob.p.mv_comp[i].hp =
1045  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
1046  }
1047  }
1048  }
1049 
1050  return (data2 - data) + size2;
1051 }
1052 
1053 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
1054  VP9Context *s)
1055 {
1056  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
1057  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
1058 }
1059 
1060 static void find_ref_mvs(VP9Context *s,
1061  VP56mv *pmv, int ref, int z, int idx, int sb)
1062 {
1063  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
1064  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
1065  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
1066  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
1067  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
1068  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
1069  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
1070  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
1071  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1072  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
1073  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1074  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
1075  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
1076  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
1077  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
1078  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
1079  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
1080  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
1081  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
1082  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1083  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1084  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1085  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1086  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1087  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1088  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
1089  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
1090  };
1091  VP9Block *b = s->b;
1092  int row = s->row, col = s->col, row7 = s->row7;
1093  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
1094 #define INVALID_MV 0x80008000U
1095  uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
1096  int i;
1097 
1098 #define RETURN_DIRECT_MV(mv) \
1099  do { \
1100  uint32_t m = AV_RN32A(&mv); \
1101  if (!idx) { \
1102  AV_WN32A(pmv, m); \
1103  return; \
1104  } else if (mem == INVALID_MV) { \
1105  mem = m; \
1106  } else if (m != mem) { \
1107  AV_WN32A(pmv, m); \
1108  return; \
1109  } \
1110  } while (0)
1111 
1112  if (sb >= 0) {
1113  if (sb == 2 || sb == 1) {
1114  RETURN_DIRECT_MV(b->mv[0][z]);
1115  } else if (sb == 3) {
1116  RETURN_DIRECT_MV(b->mv[2][z]);
1117  RETURN_DIRECT_MV(b->mv[1][z]);
1118  RETURN_DIRECT_MV(b->mv[0][z]);
1119  }
1120 
1121 #define RETURN_MV(mv) \
1122  do { \
1123  if (sb > 0) { \
1124  VP56mv tmp; \
1125  uint32_t m; \
1126  av_assert2(idx == 1); \
1127  av_assert2(mem != INVALID_MV); \
1128  if (mem_sub8x8 == INVALID_MV) { \
1129  clamp_mv(&tmp, &mv, s); \
1130  m = AV_RN32A(&tmp); \
1131  if (m != mem) { \
1132  AV_WN32A(pmv, m); \
1133  return; \
1134  } \
1135  mem_sub8x8 = AV_RN32A(&mv); \
1136  } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
1137  clamp_mv(&tmp, &mv, s); \
1138  m = AV_RN32A(&tmp); \
1139  if (m != mem) { \
1140  AV_WN32A(pmv, m); \
1141  } else { \
1142  /* BUG I'm pretty sure this isn't the intention */ \
1143  AV_WN32A(pmv, 0); \
1144  } \
1145  return; \
1146  } \
1147  } else { \
1148  uint32_t m = AV_RN32A(&mv); \
1149  if (!idx) { \
1150  clamp_mv(pmv, &mv, s); \
1151  return; \
1152  } else if (mem == INVALID_MV) { \
1153  mem = m; \
1154  } else if (m != mem) { \
1155  clamp_mv(pmv, &mv, s); \
1156  return; \
1157  } \
1158  } \
1159  } while (0)
1160 
1161  if (row > 0) {
1162  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1163  if (mv->ref[0] == ref) {
1164  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1165  } else if (mv->ref[1] == ref) {
1166  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1167  }
1168  }
1169  if (col > s->tile_col_start) {
1170  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1171  if (mv->ref[0] == ref) {
1172  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1173  } else if (mv->ref[1] == ref) {
1174  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1175  }
1176  }
1177  i = 2;
1178  } else {
1179  i = 0;
1180  }
1181 
1182  // previously coded MVs in this neighbourhood, using same reference frame
1183  for (; i < 8; i++) {
1184  int c = p[i][0] + col, r = p[i][1] + row;
1185 
1186  if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1187  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1188 
1189  if (mv->ref[0] == ref) {
1190  RETURN_MV(mv->mv[0]);
1191  } else if (mv->ref[1] == ref) {
1192  RETURN_MV(mv->mv[1]);
1193  }
1194  }
1195  }
1196 
1197  // MV at this position in previous frame, using same reference frame
1198  if (s->s.h.use_last_frame_mvs) {
1199  struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1200 
1203  if (mv->ref[0] == ref) {
1204  RETURN_MV(mv->mv[0]);
1205  } else if (mv->ref[1] == ref) {
1206  RETURN_MV(mv->mv[1]);
1207  }
1208  }
1209 
1210 #define RETURN_SCALE_MV(mv, scale) \
1211  do { \
1212  if (scale) { \
1213  VP56mv mv_temp = { -mv.x, -mv.y }; \
1214  RETURN_MV(mv_temp); \
1215  } else { \
1216  RETURN_MV(mv); \
1217  } \
1218  } while (0)
1219 
1220  // previously coded MVs in this neighbourhood, using different reference frame
1221  for (i = 0; i < 8; i++) {
1222  int c = p[i][0] + col, r = p[i][1] + row;
1223 
1224  if (c >= s->tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1225  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1226 
1227  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1228  RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1229  }
1230  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1231  // BUG - libvpx has this condition regardless of whether
1232  // we used the first ref MV and pre-scaling
1233  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1234  RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1235  }
1236  }
1237  }
1238 
1239  // MV at this position in previous frame, using different reference frame
1240  if (s->s.h.use_last_frame_mvs) {
1241  struct VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col];
1242 
1243  // no need to await_progress, because we already did that above
1244  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1245  RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]);
1246  }
1247  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1248  // BUG - libvpx has this condition regardless of whether
1249  // we used the first ref MV and pre-scaling
1250  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1251  RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]);
1252  }
1253  }
1254 
1255  AV_ZERO32(pmv);
1256  clamp_mv(pmv, pmv, s);
1257 #undef INVALID_MV
1258 #undef RETURN_MV
1259 #undef RETURN_SCALE_MV
1260 }
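/* Summary of the candidate order implemented above: direct sub-block MVs of
 * the current block (for sb >= 1), then the left/above MV context, then up to
 * eight previously coded spatial neighbours using the same reference, then the
 * co-located MV from the previous frame with the same reference, then the same
 * spatial and temporal candidates again allowing a different reference
 * (sign-inverted when the reference sign biases differ), and finally a clamped
 * zero MV as the fallback. */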
1261 
1262 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1263 {
1264  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1265  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1266  s->prob.p.mv_comp[idx].classes);
1267 
1268  s->counts.mv_comp[idx].sign[sign]++;
1269  s->counts.mv_comp[idx].classes[c]++;
1270  if (c) {
1271  int m;
1272 
1273  for (n = 0, m = 0; m < c; m++) {
1274  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1275  n |= bit << m;
1276  s->counts.mv_comp[idx].bits[m][bit]++;
1277  }
1278  n <<= 3;
1279  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1280  n |= bit << 1;
1281  s->counts.mv_comp[idx].fp[bit]++;
1282  if (hp) {
1283  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1284  s->counts.mv_comp[idx].hp[bit]++;
1285  n |= bit;
1286  } else {
1287  n |= 1;
1288  // bug in libvpx - we count for bw entropy purposes even if the
1289  // bit wasn't coded
1290  s->counts.mv_comp[idx].hp[1]++;
1291  }
1292  n += 8 << c;
1293  } else {
1294  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1295  s->counts.mv_comp[idx].class0[n]++;
1296  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1297  s->prob.p.mv_comp[idx].class0_fp[n]);
1298  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1299  n = (n << 3) | (bit << 1);
1300  if (hp) {
1301  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1302  s->counts.mv_comp[idx].class0_hp[bit]++;
1303  n |= bit;
1304  } else {
1305  n |= 1;
1306  // bug in libvpx - we count for bw entropy purposes even if the
1307  // bit wasn't coded
1308  s->counts.mv_comp[idx].class0_hp[1]++;
1309  }
1310  }
1311 
1312  return sign ? -(n + 1) : (n + 1);
1313 }
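/* Illustrative trace of read_mv_component(): with class c = 1, a single
 * integer bit of 1, fp = 2 and high precision disabled, the code computes
 * n = 1, n <<= 3 -> 8, n |= fp << 1 -> 12, n |= 1 -> 13, n += 8 << 1 -> 29,
 * and returns sign ? -30 : 30. */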
1314 
1315 static void fill_mv(VP9Context *s,
1316  VP56mv *mv, int mode, int sb)
1317 {
1318  VP9Block *b = s->b;
1319 
1320  if (mode == ZEROMV) {
1321  AV_ZERO64(mv);
1322  } else {
1323  int hp;
1324 
1325  // FIXME cache this value and reuse for other subblocks
1326  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1327  mode == NEWMV ? -1 : sb);
1328  // FIXME maybe move this code into find_ref_mvs()
1329  if ((mode == NEWMV || sb == -1) &&
1330  !(hp = s->s.h.highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1331  if (mv[0].y & 1) {
1332  if (mv[0].y < 0)
1333  mv[0].y++;
1334  else
1335  mv[0].y--;
1336  }
1337  if (mv[0].x & 1) {
1338  if (mv[0].x < 0)
1339  mv[0].x++;
1340  else
1341  mv[0].x--;
1342  }
1343  }
1344  if (mode == NEWMV) {
1345  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1346  s->prob.p.mv_joint);
1347 
1348  s->counts.mv_joint[j]++;
1349  if (j >= MV_JOINT_V)
1350  mv[0].y += read_mv_component(s, 0, hp);
1351  if (j & 1)
1352  mv[0].x += read_mv_component(s, 1, hp);
1353  }
1354 
1355  if (b->comp) {
1356  // FIXME cache this value and reuse for other subblocks
1357  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1358  mode == NEWMV ? -1 : sb);
1359  if ((mode == NEWMV || sb == -1) &&
1360  !(hp = s->s.h.highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1361  if (mv[1].y & 1) {
1362  if (mv[1].y < 0)
1363  mv[1].y++;
1364  else
1365  mv[1].y--;
1366  }
1367  if (mv[1].x & 1) {
1368  if (mv[1].x < 0)
1369  mv[1].x++;
1370  else
1371  mv[1].x--;
1372  }
1373  }
1374  if (mode == NEWMV) {
1375  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1376  s->prob.p.mv_joint);
1377 
1378  s->counts.mv_joint[j]++;
1379  if (j >= MV_JOINT_V)
1380  mv[1].y += read_mv_component(s, 0, hp);
1381  if (j & 1)
1382  mv[1].x += read_mv_component(s, 1, hp);
1383  }
1384  }
1385  }
1386 }
1387 
1388 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1389  ptrdiff_t stride, int v)
1390 {
1391  switch (w) {
1392  case 1:
1393  do {
1394  *ptr = v;
1395  ptr += stride;
1396  } while (--h);
1397  break;
1398  case 2: {
1399  int v16 = v * 0x0101;
1400  do {
1401  AV_WN16A(ptr, v16);
1402  ptr += stride;
1403  } while (--h);
1404  break;
1405  }
1406  case 4: {
1407  uint32_t v32 = v * 0x01010101;
1408  do {
1409  AV_WN32A(ptr, v32);
1410  ptr += stride;
1411  } while (--h);
1412  break;
1413  }
1414  case 8: {
1415 #if HAVE_FAST_64BIT
1416  uint64_t v64 = v * 0x0101010101010101ULL;
1417  do {
1418  AV_WN64A(ptr, v64);
1419  ptr += stride;
1420  } while (--h);
1421 #else
1422  uint32_t v32 = v * 0x01010101;
1423  do {
1424  AV_WN32A(ptr, v32);
1425  AV_WN32A(ptr + 4, v32);
1426  ptr += stride;
1427  } while (--h);
1428 #endif
1429  break;
1430  }
1431  }
1432 }
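/* Usage sketch for setctx_2d(): the call in decode_mode() below stamps
 * b->seg_id into the frame's segmentation map; for a 16x16 block
 * (bw4 = bh4 = 2) that is two rows of one aligned 16-bit store each, with v
 * replicated into every byte. Only w values of 1, 2, 4 and 8 are supported. */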
1433 
1434 static void decode_mode(AVCodecContext *ctx)
1435 {
1436  static const uint8_t left_ctx[N_BS_SIZES] = {
1437  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1438  };
1439  static const uint8_t above_ctx[N_BS_SIZES] = {
1440  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1441  };
1442  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1444  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1445  };
1446  VP9Context *s = ctx->priv_data;
1447  VP9Block *b = s->b;
1448  int row = s->row, col = s->col, row7 = s->row7;
1449  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1450  int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
1451  int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
1452  int have_a = row > 0, have_l = col > s->tile_col_start;
1453  int vref, filter_id;
1454 
1455  if (!s->s.h.segmentation.enabled) {
1456  b->seg_id = 0;
1457  } else if (s->s.h.keyframe || s->s.h.intraonly) {
1458  b->seg_id = !s->s.h.segmentation.update_map ? 0 :
1459  vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->s.h.segmentation.prob);
1460  } else if (!s->s.h.segmentation.update_map ||
1461  (s->s.h.segmentation.temporal &&
1464  s->left_segpred_ctx[row7]]))) {
1466  int pred = 8, x;
1468 
1471  for (y = 0; y < h4; y++) {
1472  int idx_base = (y + row) * 8 * s->sb_cols + col;
1473  for (x = 0; x < w4; x++)
1474  pred = FFMIN(pred, refsegmap[idx_base + x]);
1475  }
1476  av_assert1(pred < 8);
1477  b->seg_id = pred;
1478  } else {
1479  b->seg_id = 0;
1480  }
1481 
1482  memset(&s->above_segpred_ctx[col], 1, w4);
1483  memset(&s->left_segpred_ctx[row7], 1, h4);
1484  } else {
1486  s->s.h.segmentation.prob);
1487 
1488  memset(&s->above_segpred_ctx[col], 0, w4);
1489  memset(&s->left_segpred_ctx[row7], 0, h4);
1490  }
1491  if (s->s.h.segmentation.enabled &&
1492  (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
1493  setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1494  bw4, bh4, 8 * s->sb_cols, b->seg_id);
1495  }
1496 
1497  b->skip = s->s.h.segmentation.enabled &&
1498  s->s.h.segmentation.feat[b->seg_id].skip_enabled;
1499  if (!b->skip) {
1500  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1501  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1502  s->counts.skip[c][b->skip]++;
1503  }
1504 
1505  if (s->s.h.keyframe || s->s.h.intraonly) {
1506  b->intra = 1;
1507  } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1508  b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
1509  } else {
1510  int c, bit;
1511 
1512  if (have_a && have_l) {
1513  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1514  c += (c == 2);
1515  } else {
1516  c = have_a ? 2 * s->above_intra_ctx[col] :
1517  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1518  }
1519  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1520  s->counts.intra[c][bit]++;
1521  b->intra = !bit;
1522  }
1523 
1524  if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
1525  int c;
1526  if (have_a) {
1527  if (have_l) {
1528  c = (s->above_skip_ctx[col] ? max_tx :
1529  s->above_txfm_ctx[col]) +
1530  (s->left_skip_ctx[row7] ? max_tx :
1531  s->left_txfm_ctx[row7]) > max_tx;
1532  } else {
1533  c = s->above_skip_ctx[col] ? 1 :
1534  (s->above_txfm_ctx[col] * 2 > max_tx);
1535  }
1536  } else if (have_l) {
1537  c = s->left_skip_ctx[row7] ? 1 :
1538  (s->left_txfm_ctx[row7] * 2 > max_tx);
1539  } else {
1540  c = 1;
1541  }
1542  switch (max_tx) {
1543  case TX_32X32:
1544  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1545  if (b->tx) {
1546  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1547  if (b->tx == 2)
1548  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1549  }
1550  s->counts.tx32p[c][b->tx]++;
1551  break;
1552  case TX_16X16:
1553  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1554  if (b->tx)
1555  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1556  s->counts.tx16p[c][b->tx]++;
1557  break;
1558  case TX_8X8:
1559  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1560  s->counts.tx8p[c][b->tx]++;
1561  break;
1562  case TX_4X4:
1563  b->tx = TX_4X4;
1564  break;
1565  }
1566  } else {
1567  b->tx = FFMIN(max_tx, s->s.h.txfmmode);
1568  }
1569 
1570  if (s->s.h.keyframe || s->s.h.intraonly) {
1571  uint8_t *a = &s->above_mode_ctx[col * 2];
1572  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1573 
1574  b->comp = 0;
1575  if (b->bs > BS_8x8) {
1576  // FIXME the memory storage intermediates here aren't really
1577  // necessary, they're just there to make the code slightly
1578  // simpler for now
1579  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1580  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1581  if (b->bs != BS_8x4) {
1582  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1583  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1584  l[0] = a[1] = b->mode[1];
1585  } else {
1586  l[0] = a[1] = b->mode[1] = b->mode[0];
1587  }
1588  if (b->bs != BS_4x8) {
1589  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1590  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1591  if (b->bs != BS_8x4) {
1592  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1593  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1594  l[1] = a[1] = b->mode[3];
1595  } else {
1596  l[1] = a[1] = b->mode[3] = b->mode[2];
1597  }
1598  } else {
1599  b->mode[2] = b->mode[0];
1600  l[1] = a[1] = b->mode[3] = b->mode[1];
1601  }
1602  } else {
1603  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1604  vp9_default_kf_ymode_probs[*a][*l]);
1605  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1606  // FIXME this can probably be optimized
1607  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1608  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1609  }
1610  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1611  vp9_default_kf_uvmode_probs[b->mode[3]]);
1612  } else if (b->intra) {
1613  b->comp = 0;
1614  if (b->bs > BS_8x8) {
1615  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1616  s->prob.p.y_mode[0]);
1617  s->counts.y_mode[0][b->mode[0]]++;
1618  if (b->bs != BS_8x4) {
1619  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1620  s->prob.p.y_mode[0]);
1621  s->counts.y_mode[0][b->mode[1]]++;
1622  } else {
1623  b->mode[1] = b->mode[0];
1624  }
1625  if (b->bs != BS_4x8) {
1626  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1627  s->prob.p.y_mode[0]);
1628  s->counts.y_mode[0][b->mode[2]]++;
1629  if (b->bs != BS_8x4) {
1630  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1631  s->prob.p.y_mode[0]);
1632  s->counts.y_mode[0][b->mode[3]]++;
1633  } else {
1634  b->mode[3] = b->mode[2];
1635  }
1636  } else {
1637  b->mode[2] = b->mode[0];
1638  b->mode[3] = b->mode[1];
1639  }
1640  } else {
1641  static const uint8_t size_group[10] = {
1642  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1643  };
1644  int sz = size_group[b->bs];
1645 
1646  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1647  s->prob.p.y_mode[sz]);
1648  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1649  s->counts.y_mode[sz][b->mode[3]]++;
1650  }
1651  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1652  s->prob.p.uv_mode[b->mode[3]]);
1653  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1654  } else {
1655  static const uint8_t inter_mode_ctx_lut[14][14] = {
1656  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1657  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1658  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1659  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1660  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1661  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1662  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1663  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1664  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1665  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1666  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1667  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1668  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1669  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1670  };
1671 
1672  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
1673  av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
1674  b->comp = 0;
1675  b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
1676  } else {
1677  // read comp_pred flag
1678  if (s->s.h.comppredmode != PRED_SWITCHABLE) {
1679  b->comp = s->s.h.comppredmode == PRED_COMPREF;
1680  } else {
1681  int c;
1682 
1683  // FIXME add intra as ref=0xff (or -1) to make these easier?
1684  if (have_a) {
1685  if (have_l) {
1686  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1687  c = 4;
1688  } else if (s->above_comp_ctx[col]) {
1689  c = 2 + (s->left_intra_ctx[row7] ||
1690  s->left_ref_ctx[row7] == s->s.h.fixcompref);
1691  } else if (s->left_comp_ctx[row7]) {
1692  c = 2 + (s->above_intra_ctx[col] ||
1693  s->above_ref_ctx[col] == s->s.h.fixcompref);
1694  } else {
1695  c = (!s->above_intra_ctx[col] &&
1696  s->above_ref_ctx[col] == s->s.h.fixcompref) ^
1697  (!s->left_intra_ctx[row7] &&
1698  s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
1699  }
1700  } else {
1701  c = s->above_comp_ctx[col] ? 3 :
1702  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
1703  }
1704  } else if (have_l) {
1705  c = s->left_comp_ctx[row7] ? 3 :
1706  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
1707  } else {
1708  c = 1;
1709  }
1710  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1711  s->counts.comp[c][b->comp]++;
1712  }
1713 
1714  // read actual references
1715  // FIXME probably cache a few variables here to prevent repetitive
1716  // memory accesses below
1717  if (b->comp) /* two references */ {
1718  int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
1719 
1720  b->ref[fix_idx] = s->s.h.fixcompref;
1721  // FIXME can this codeblob be replaced by some sort of LUT?
1722  if (have_a) {
1723  if (have_l) {
1724  if (s->above_intra_ctx[col]) {
1725  if (s->left_intra_ctx[row7]) {
1726  c = 2;
1727  } else {
1728  c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1729  }
1730  } else if (s->left_intra_ctx[row7]) {
1731  c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1732  } else {
1733  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1734 
1735  if (refl == refa && refa == s->s.h.varcompref[1]) {
1736  c = 0;
1737  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1738  if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
1739  (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
1740  c = 4;
1741  } else {
1742  c = (refa == refl) ? 3 : 1;
1743  }
1744  } else if (!s->left_comp_ctx[row7]) {
1745  if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
1746  c = 1;
1747  } else {
1748  c = (refl == s->s.h.varcompref[1] &&
1749  refa != s->s.h.varcompref[1]) ? 2 : 4;
1750  }
1751  } else if (!s->above_comp_ctx[col]) {
1752  if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
1753  c = 1;
1754  } else {
1755  c = (refa == s->s.h.varcompref[1] &&
1756  refl != s->s.h.varcompref[1]) ? 2 : 4;
1757  }
1758  } else {
1759  c = (refl == refa) ? 4 : 2;
1760  }
1761  }
1762  } else {
1763  if (s->above_intra_ctx[col]) {
1764  c = 2;
1765  } else if (s->above_comp_ctx[col]) {
1766  c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1767  } else {
1768  c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
1769  }
1770  }
1771  } else if (have_l) {
1772  if (s->left_intra_ctx[row7]) {
1773  c = 2;
1774  } else if (s->left_comp_ctx[row7]) {
1775  c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1776  } else {
1777  c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
1778  }
1779  } else {
1780  c = 2;
1781  }
1782  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1783  b->ref[var_idx] = s->s.h.varcompref[bit];
1784  s->counts.comp_ref[c][bit]++;
1785  } else /* single reference */ {
1786  int bit, c;
1787 
1788  if (have_a && !s->above_intra_ctx[col]) {
1789  if (have_l && !s->left_intra_ctx[row7]) {
1790  if (s->left_comp_ctx[row7]) {
1791  if (s->above_comp_ctx[col]) {
1792  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
1793  !s->above_ref_ctx[col]);
1794  } else {
1795  c = (3 * !s->above_ref_ctx[col]) +
1796  (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1797  }
1798  } else if (s->above_comp_ctx[col]) {
1799  c = (3 * !s->left_ref_ctx[row7]) +
1800  (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1801  } else {
1802  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1803  }
1804  } else if (s->above_intra_ctx[col]) {
1805  c = 2;
1806  } else if (s->above_comp_ctx[col]) {
1807  c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
1808  } else {
1809  c = 4 * (!s->above_ref_ctx[col]);
1810  }
1811  } else if (have_l && !s->left_intra_ctx[row7]) {
1812  if (s->left_intra_ctx[row7]) {
1813  c = 2;
1814  } else if (s->left_comp_ctx[row7]) {
1815  c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
1816  } else {
1817  c = 4 * (!s->left_ref_ctx[row7]);
1818  }
1819  } else {
1820  c = 2;
1821  }
1822  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1823  s->counts.single_ref[c][0][bit]++;
1824  if (!bit) {
1825  b->ref[0] = 0;
1826  } else {
1827  // FIXME can this codeblob be replaced by some sort of LUT?
1828  if (have_a) {
1829  if (have_l) {
1830  if (s->left_intra_ctx[row7]) {
1831  if (s->above_intra_ctx[col]) {
1832  c = 2;
1833  } else if (s->above_comp_ctx[col]) {
1834  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1835  s->above_ref_ctx[col] == 1);
1836  } else if (!s->above_ref_ctx[col]) {
1837  c = 3;
1838  } else {
1839  c = 4 * (s->above_ref_ctx[col] == 1);
1840  }
1841  } else if (s->above_intra_ctx[col]) {
1842  if (s->left_intra_ctx[row7]) {
1843  c = 2;
1844  } else if (s->left_comp_ctx[row7]) {
1845  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1846  s->left_ref_ctx[row7] == 1);
1847  } else if (!s->left_ref_ctx[row7]) {
1848  c = 3;
1849  } else {
1850  c = 4 * (s->left_ref_ctx[row7] == 1);
1851  }
1852  } else if (s->above_comp_ctx[col]) {
1853  if (s->left_comp_ctx[row7]) {
1854  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1855  c = 3 * (s->s.h.fixcompref == 1 ||
1856  s->left_ref_ctx[row7] == 1);
1857  } else {
1858  c = 2;
1859  }
1860  } else if (!s->left_ref_ctx[row7]) {
1861  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1862  s->above_ref_ctx[col] == 1);
1863  } else {
1864  c = 3 * (s->left_ref_ctx[row7] == 1) +
1865  (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1866  }
1867  } else if (s->left_comp_ctx[row7]) {
1868  if (!s->above_ref_ctx[col]) {
1869  c = 1 + 2 * (s->s.h.fixcompref == 1 ||
1870  s->left_ref_ctx[row7] == 1);
1871  } else {
1872  c = 3 * (s->above_ref_ctx[col] == 1) +
1873  (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1874  }
1875  } else if (!s->above_ref_ctx[col]) {
1876  if (!s->left_ref_ctx[row7]) {
1877  c = 3;
1878  } else {
1879  c = 4 * (s->left_ref_ctx[row7] == 1);
1880  }
1881  } else if (!s->left_ref_ctx[row7]) {
1882  c = 4 * (s->above_ref_ctx[col] == 1);
1883  } else {
1884  c = 2 * (s->left_ref_ctx[row7] == 1) +
1885  2 * (s->above_ref_ctx[col] == 1);
1886  }
1887  } else {
1888  if (s->above_intra_ctx[col] ||
1889  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1890  c = 2;
1891  } else if (s->above_comp_ctx[col]) {
1892  c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
1893  } else {
1894  c = 4 * (s->above_ref_ctx[col] == 1);
1895  }
1896  }
1897  } else if (have_l) {
1898  if (s->left_intra_ctx[row7] ||
1899  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1900  c = 2;
1901  } else if (s->left_comp_ctx[row7]) {
1902  c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1903  } else {
1904  c = 4 * (s->left_ref_ctx[row7] == 1);
1905  }
1906  } else {
1907  c = 2;
1908  }
1909  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1910  s->counts.single_ref[c][1][bit]++;
1911  b->ref[0] = 1 + bit;
1912  }
1913  }
1914  }
1915 
1916  if (b->bs <= BS_8x8) {
1917  if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
1918  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1919  } else {
1920  static const uint8_t off[10] = {
1921  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1922  };
1923 
1924  // FIXME this needs to use the LUT tables from find_ref_mvs
1925  // because not all are -1,0/0,-1
1926  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1927  [s->left_mode_ctx[row7 + off[b->bs]]];
1928 
 1929  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
 1930  s->prob.p.mv_mode[c]);
1931  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1932  s->counts.mv_mode[c][b->mode[0] - 10]++;
1933  }
1934  }
1935 
1936  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
1937  int c;
1938 
1939  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1940  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1941  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1942  s->left_filter_ctx[row7] : 3;
1943  } else {
1944  c = s->above_filter_ctx[col];
1945  }
1946  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1947  c = s->left_filter_ctx[row7];
1948  } else {
1949  c = 3;
1950  }
1951 
1952  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1953  s->prob.p.filter[c]);
1954  s->counts.filter[c][filter_id]++;
1955  b->filter = vp9_filter_lut[filter_id];
1956  } else {
1957  b->filter = s->s.h.filtermode;
1958  }
1959 
1960  if (b->bs > BS_8x8) {
1961  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1962 
 1963  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
 1964  s->prob.p.mv_mode[c]);
1965  s->counts.mv_mode[c][b->mode[0] - 10]++;
1966  fill_mv(s, b->mv[0], b->mode[0], 0);
1967 
1968  if (b->bs != BS_8x4) {
 1969  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
 1970  s->prob.p.mv_mode[c]);
1971  s->counts.mv_mode[c][b->mode[1] - 10]++;
1972  fill_mv(s, b->mv[1], b->mode[1], 1);
1973  } else {
1974  b->mode[1] = b->mode[0];
1975  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1976  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1977  }
1978 
1979  if (b->bs != BS_4x8) {
 1980  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
 1981  s->prob.p.mv_mode[c]);
1982  s->counts.mv_mode[c][b->mode[2] - 10]++;
1983  fill_mv(s, b->mv[2], b->mode[2], 2);
1984 
1985  if (b->bs != BS_8x4) {
 1986  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
 1987  s->prob.p.mv_mode[c]);
1988  s->counts.mv_mode[c][b->mode[3] - 10]++;
1989  fill_mv(s, b->mv[3], b->mode[3], 3);
1990  } else {
1991  b->mode[3] = b->mode[2];
1992  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1993  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1994  }
1995  } else {
1996  b->mode[2] = b->mode[0];
1997  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1998  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1999  b->mode[3] = b->mode[1];
2000  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
2001  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
2002  }
2003  } else {
2004  fill_mv(s, b->mv[0], b->mode[0], -1);
2005  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
2006  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
2007  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
2008  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
2009  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
2010  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
2011  }
2012 
2013  vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
2014  }
2015 
2016 #if HAVE_FAST_64BIT
2017 #define SPLAT_CTX(var, val, n) \
2018  switch (n) { \
2019  case 1: var = val; break; \
2020  case 2: AV_WN16A(&var, val * 0x0101); break; \
2021  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2022  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
2023  case 16: { \
2024  uint64_t v64 = val * 0x0101010101010101ULL; \
2025  AV_WN64A( &var, v64); \
2026  AV_WN64A(&((uint8_t *) &var)[8], v64); \
2027  break; \
2028  } \
2029  }
2030 #else
2031 #define SPLAT_CTX(var, val, n) \
2032  switch (n) { \
2033  case 1: var = val; break; \
2034  case 2: AV_WN16A(&var, val * 0x0101); break; \
2035  case 4: AV_WN32A(&var, val * 0x01010101); break; \
2036  case 8: { \
2037  uint32_t v32 = val * 0x01010101; \
2038  AV_WN32A( &var, v32); \
2039  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2040  break; \
2041  } \
2042  case 16: { \
2043  uint32_t v32 = val * 0x01010101; \
2044  AV_WN32A( &var, v32); \
2045  AV_WN32A(&((uint8_t *) &var)[4], v32); \
2046  AV_WN32A(&((uint8_t *) &var)[8], v32); \
2047  AV_WN32A(&((uint8_t *) &var)[12], v32); \
2048  break; \
2049  } \
2050  }
2051 #endif
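 // Illustrative note: SPLAT_CTX(var, val, n) broadcasts one byte value into n
 // consecutive context bytes with one or two aligned stores, e.g.
 // SPLAT_CTX(s->above_skip_ctx[col], b->skip, 8) stores b->skip * 0x0101010101010101
 // as a single 64-bit write on HAVE_FAST_64BIT targets and as two 32-bit writes
 // otherwise.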
2052 
2053  switch (bwh_tab[1][b->bs][0]) {
2054 #define SET_CTXS(dir, off, n) \
2055  do { \
2056  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
2057  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
2058  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
2059  if (!s->s.h.keyframe && !s->s.h.intraonly) { \
2060  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
2061  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
2062  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
2063  if (!b->intra) { \
2064  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
2065  if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
2066  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
2067  } \
2068  } \
2069  } \
2070  } while (0)
2071  case 1: SET_CTXS(above, col, 1); break;
2072  case 2: SET_CTXS(above, col, 2); break;
2073  case 4: SET_CTXS(above, col, 4); break;
2074  case 8: SET_CTXS(above, col, 8); break;
2075  }
2076  switch (bwh_tab[1][b->bs][1]) {
2077  case 1: SET_CTXS(left, row7, 1); break;
2078  case 2: SET_CTXS(left, row7, 2); break;
2079  case 4: SET_CTXS(left, row7, 4); break;
2080  case 8: SET_CTXS(left, row7, 8); break;
2081  }
2082 #undef SPLAT_CTX
2083 #undef SET_CTXS
2084 
2085  if (!s->s.h.keyframe && !s->s.h.intraonly) {
2086  if (b->bs > BS_8x8) {
2087  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2088 
2089  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
2090  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
2091  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
2092  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
2093  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
2094  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
2095  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
2096  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
2097  } else {
2098  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
2099 
2100  for (n = 0; n < w4 * 2; n++) {
2101  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
2102  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
2103  }
2104  for (n = 0; n < h4 * 2; n++) {
2105  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
2106  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
2107  }
2108  }
2109  }
2110 
2111  // FIXME kinda ugly
2112  for (y = 0; y < h4; y++) {
2113  int x, o = (row + y) * s->sb_cols * 8 + col;
2114  struct VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
2115 
2116  if (b->intra) {
2117  for (x = 0; x < w4; x++) {
2118  mv[x].ref[0] =
2119  mv[x].ref[1] = -1;
2120  }
2121  } else if (b->comp) {
2122  for (x = 0; x < w4; x++) {
2123  mv[x].ref[0] = b->ref[0];
2124  mv[x].ref[1] = b->ref[1];
2125  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2126  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2127  }
2128  } else {
2129  for (x = 0; x < w4; x++) {
2130  mv[x].ref[0] = b->ref[0];
2131  mv[x].ref[1] = -1;
2132  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2133  }
2134  }
2135  }
2136 }
2137 
2138 // FIXME merge cnt/eob arguments?
2139 static av_always_inline int
2140 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2141  int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
2142  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2143  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2144  const int16_t *band_counts, const int16_t *qmul)
2145 {
2146  int i = 0, band = 0, band_left = band_counts[band];
2147  uint8_t *tp = p[0][nnz];
2148  uint8_t cache[1024];
2149 
2150  do {
2151  int val, rc;
2152 
2153  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2154  eob[band][nnz][val]++;
2155  if (!val)
2156  break;
2157 
2158  skip_eob:
2159  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2160  cnt[band][nnz][0]++;
2161  if (!--band_left)
2162  band_left = band_counts[++band];
2163  cache[scan[i]] = 0;
2164  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2165  tp = p[band][nnz];
2166  if (++i == n_coeffs)
2167  break; //invalid input; blocks should end with EOB
2168  goto skip_eob;
2169  }
2170 
2171  rc = scan[i];
2172  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2173  cnt[band][nnz][1]++;
2174  val = 1;
2175  cache[rc] = 1;
2176  } else {
2177  // fill in p[3-10] (model fill) - only once per frame for each pos
2178  if (!tp[3])
2179  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2180 
2181  cnt[band][nnz][2]++;
2182  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2183  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2184  cache[rc] = val = 2;
2185  } else {
2186  val = 3 + vp56_rac_get_prob(c, tp[5]);
2187  cache[rc] = 3;
2188  }
2189  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2190  cache[rc] = 4;
2191  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2192  val = 5 + vp56_rac_get_prob(c, 159);
2193  } else {
2194  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2195  val += vp56_rac_get_prob(c, 145);
2196  }
2197  } else { // cat 3-6
2198  cache[rc] = 5;
2199  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2200  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2201  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2202  val += (vp56_rac_get_prob(c, 148) << 1);
2203  val += vp56_rac_get_prob(c, 140);
2204  } else {
2205  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2206  val += (vp56_rac_get_prob(c, 155) << 2);
2207  val += (vp56_rac_get_prob(c, 140) << 1);
2208  val += vp56_rac_get_prob(c, 135);
2209  }
2210  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2211  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2212  val += (vp56_rac_get_prob(c, 157) << 3);
2213  val += (vp56_rac_get_prob(c, 141) << 2);
2214  val += (vp56_rac_get_prob(c, 134) << 1);
2215  val += vp56_rac_get_prob(c, 130);
2216  } else {
2217  val = 67;
2218  if (!is8bitsperpixel) {
2219  if (bpp == 12) {
2220  val += vp56_rac_get_prob(c, 255) << 17;
2221  val += vp56_rac_get_prob(c, 255) << 16;
2222  }
2223  val += (vp56_rac_get_prob(c, 255) << 15);
2224  val += (vp56_rac_get_prob(c, 255) << 14);
2225  }
2226  val += (vp56_rac_get_prob(c, 254) << 13);
2227  val += (vp56_rac_get_prob(c, 254) << 12);
2228  val += (vp56_rac_get_prob(c, 254) << 11);
2229  val += (vp56_rac_get_prob(c, 252) << 10);
2230  val += (vp56_rac_get_prob(c, 249) << 9);
2231  val += (vp56_rac_get_prob(c, 243) << 8);
2232  val += (vp56_rac_get_prob(c, 230) << 7);
2233  val += (vp56_rac_get_prob(c, 196) << 6);
2234  val += (vp56_rac_get_prob(c, 177) << 5);
2235  val += (vp56_rac_get_prob(c, 153) << 4);
2236  val += (vp56_rac_get_prob(c, 140) << 3);
2237  val += (vp56_rac_get_prob(c, 133) << 2);
2238  val += (vp56_rac_get_prob(c, 130) << 1);
2239  val += vp56_rac_get_prob(c, 129);
2240  }
2241  }
2242  }
2243 #define STORE_COEF(c, i, v) do { \
2244  if (is8bitsperpixel) { \
2245  c[i] = v; \
2246  } else { \
2247  AV_WN32A(&c[i * 2], v); \
2248  } \
2249 } while (0)
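 // Illustrative note: STORE_COEF hides the coefficient storage width. At 8 bits
 // per pixel the dequantized value fits a single int16_t slot; at 10/12 bits it
 // can exceed 16 bits, so it is written as a 32-bit word spanning two int16_t
 // slots (hence the i * 2 index).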
2250  if (!--band_left)
2251  band_left = band_counts[++band];
2252  if (is_tx32x32)
2253  STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
2254  else
2255  STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
2256  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2257  tp = p[band][nnz];
2258  } while (++i < n_coeffs);
2259 
2260  return i;
2261 }
2262 
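 // The four wrappers below just pin the template arguments (tx32 flag and bit
 // depth) so the always-inline generic decoder above is specialized per variant.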
2263 static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2264  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2265  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2266  const int16_t (*nb)[2], const int16_t *band_counts,
2267  const int16_t *qmul)
2268 {
2269  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
2270  nnz, scan, nb, band_counts, qmul);
2271 }
2272 
2273 static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2274  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2275  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2276  const int16_t (*nb)[2], const int16_t *band_counts,
2277  const int16_t *qmul)
2278 {
2279  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
2280  nnz, scan, nb, band_counts, qmul);
2281 }
2282 
2283 static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2284  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2285  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2286  const int16_t (*nb)[2], const int16_t *band_counts,
2287  const int16_t *qmul)
2288 {
2289  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->s.h.bpp, cnt, eob, p,
2290  nnz, scan, nb, band_counts, qmul);
2291 }
2292 
2293 static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
2294  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2295  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2296  const int16_t (*nb)[2], const int16_t *band_counts,
2297  const int16_t *qmul)
2298 {
2299  return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->s.h.bpp, cnt, eob, p,
2300  nnz, scan, nb, band_counts, qmul);
2301 }
2302 
2303 static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
2304 {
2305  VP9Context *s = ctx->priv_data;
2306  VP9Block *b = s->b;
2307  int row = s->row, col = s->col;
2308  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2309  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2310  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2311  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2312  int end_x = FFMIN(2 * (s->cols - col), w4);
2313  int end_y = FFMIN(2 * (s->rows - row), h4);
2314  int n, pl, x, y, res;
2315  int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
2316  int tx = 4 * s->s.h.lossless + b->tx;
2317  const int16_t * const *yscans = vp9_scans[tx];
2318  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2319  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2320  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2321  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2322  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2323  static const int16_t band_counts[4][8] = {
2324  { 1, 2, 3, 4, 3, 16 - 13 },
2325  { 1, 2, 3, 4, 11, 64 - 21 },
2326  { 1, 2, 3, 4, 11, 256 - 21 },
2327  { 1, 2, 3, 4, 11, 1024 - 21 },
2328  };
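 // Illustrative note: each band_counts row sums to the coefficient count of its
 // transform size (16, 64, 256, 1024); the entries give how many scan positions
 // fall into each of the 6 probability bands, e.g. TX_4X4: 1+2+3+4+3+3 = 16.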
2329  const int16_t *y_band_counts = band_counts[b->tx];
2330  const int16_t *uv_band_counts = band_counts[b->uvtx];
2331  int bytesperpixel = is8bitsperpixel ? 1 : 2;
2332  int total_coeff = 0;
2333 
2334 #define MERGE(la, end, step, rd) \
2335  for (n = 0; n < end; n += step) \
2336  la[n] = !!rd(&la[n])
2337 #define MERGE_CTX(step, rd) \
2338  do { \
2339  MERGE(l, end_y, step, rd); \
2340  MERGE(a, end_x, step, rd); \
2341  } while (0)
2342 
2343 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2344  for (n = 0, y = 0; y < end_y; y += step) { \
2345  for (x = 0; x < end_x; x += step, n += step * step) { \
2346  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2347  res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2348  (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
2349  c, e, p, a[x] + l[y], yscans[txtp], \
2350  ynbs[txtp], y_band_counts, qmul[0]); \
2351  a[x] = l[y] = !!res; \
2352  total_coeff |= !!res; \
2353  if (step >= 4) { \
2354  AV_WN16A(&s->eob[n], res); \
2355  } else { \
2356  s->eob[n] = res; \
2357  } \
2358  } \
2359  }
2360 
2361 #define SPLAT(la, end, step, cond) \
2362  if (step == 2) { \
2363  for (n = 1; n < end; n += step) \
2364  la[n] = la[n - 1]; \
2365  } else if (step == 4) { \
2366  if (cond) { \
2367  for (n = 0; n < end; n += step) \
2368  AV_WN32A(&la[n], la[n] * 0x01010101); \
2369  } else { \
2370  for (n = 0; n < end; n += step) \
2371  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2372  } \
2373  } else /* step == 8 */ { \
2374  if (cond) { \
2375  if (HAVE_FAST_64BIT) { \
2376  for (n = 0; n < end; n += step) \
2377  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2378  } else { \
2379  for (n = 0; n < end; n += step) { \
2380  uint32_t v32 = la[n] * 0x01010101; \
2381  AV_WN32A(&la[n], v32); \
2382  AV_WN32A(&la[n + 4], v32); \
2383  } \
2384  } \
2385  } else { \
2386  for (n = 0; n < end; n += step) \
2387  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2388  } \
2389  }
2390 #define SPLAT_CTX(step) \
2391  do { \
2392  SPLAT(a, end_x, step, end_x == w4); \
2393  SPLAT(l, end_y, step, end_y == h4); \
2394  } while (0)
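 // Illustrative note: MERGE_CTX collapses the per-4x4 nonzero flags covered by a
 // single transform block into one flag before decoding, and SPLAT_CTX copies the
 // resulting flag back to every 4x4 position the block covers, so neighbouring
 // blocks see consistent nnz context regardless of their own transform size.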
2395 
2396  /* y tokens */
2397  switch (b->tx) {
2398  case TX_4X4:
2399  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2400  break;
2401  case TX_8X8:
2402  MERGE_CTX(2, AV_RN16A);
2403  DECODE_Y_COEF_LOOP(2, 0,);
2404  SPLAT_CTX(2);
2405  break;
2406  case TX_16X16:
2407  MERGE_CTX(4, AV_RN32A);
2408  DECODE_Y_COEF_LOOP(4, 0,);
2409  SPLAT_CTX(4);
2410  break;
2411  case TX_32X32:
2412  MERGE_CTX(8, AV_RN64A);
2413  DECODE_Y_COEF_LOOP(8, 0, 32);
2414  SPLAT_CTX(8);
2415  break;
2416  }
2417 
2418 #define DECODE_UV_COEF_LOOP(step, v) \
2419  for (n = 0, y = 0; y < end_y; y += step) { \
2420  for (x = 0; x < end_x; x += step, n += step * step) { \
2421  res = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
2422  (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
2423  16 * step * step, c, e, p, a[x] + l[y], \
2424  uvscan, uvnb, uv_band_counts, qmul[1]); \
2425  a[x] = l[y] = !!res; \
2426  total_coeff |= !!res; \
2427  if (step >= 4) { \
2428  AV_WN16A(&s->uveob[pl][n], res); \
2429  } else { \
2430  s->uveob[pl][n] = res; \
2431  } \
2432  } \
2433  }
2434 
2435  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2436  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2437  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2438  w4 >>= s->ss_h;
2439  end_x >>= s->ss_h;
2440  h4 >>= s->ss_v;
2441  end_y >>= s->ss_v;
2442  for (pl = 0; pl < 2; pl++) {
2443  a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
2444  l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
2445  switch (b->uvtx) {
2446  case TX_4X4:
2447  DECODE_UV_COEF_LOOP(1,);
2448  break;
2449  case TX_8X8:
2450  MERGE_CTX(2, AV_RN16A);
2451  DECODE_UV_COEF_LOOP(2,);
2452  SPLAT_CTX(2);
2453  break;
2454  case TX_16X16:
2455  MERGE_CTX(4, AV_RN32A);
2456  DECODE_UV_COEF_LOOP(4,);
2457  SPLAT_CTX(4);
2458  break;
2459  case TX_32X32:
2460  MERGE_CTX(8, AV_RN64A);
2461  DECODE_UV_COEF_LOOP(8, 32);
2462  SPLAT_CTX(8);
2463  break;
2464  }
2465  }
2466 
2467  return total_coeff;
2468 }
2469 
 2470 static int decode_coeffs_8bpp(AVCodecContext *ctx)
 2471 {
2472  return decode_coeffs(ctx, 1);
2473 }
2474 
 2475 static int decode_coeffs_16bpp(AVCodecContext *ctx)
 2476 {
2477  return decode_coeffs(ctx, 0);
2478 }
2479 
 2480 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
 2481  uint8_t *dst_edge, ptrdiff_t stride_edge,
2482  uint8_t *dst_inner, ptrdiff_t stride_inner,
2483  uint8_t *l, int col, int x, int w,
2484  int row, int y, enum TxfmMode tx,
2485  int p, int ss_h, int ss_v, int bytesperpixel)
2486 {
2487  int have_top = row > 0 || y > 0;
2488  int have_left = col > s->tile_col_start || x > 0;
2489  int have_right = x < w - 1;
2490  int bpp = s->s.h.bpp;
2491  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2492  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2493  { DC_127_PRED, VERT_PRED } },
2494  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2495  { HOR_PRED, HOR_PRED } },
2496  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2497  { LEFT_DC_PRED, DC_PRED } },
 2498  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
 2499  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
 2500  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
 2501  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
 2502  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
 2503  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
 2504  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
 2505  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
 2506  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
 2507  { DC_127_PRED, VERT_LEFT_PRED } },
2508  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2509  { HOR_UP_PRED, HOR_UP_PRED } },
2510  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2511  { HOR_PRED, TM_VP8_PRED } },
2512  };
2513  static const struct {
2514  uint8_t needs_left:1;
2515  uint8_t needs_top:1;
2516  uint8_t needs_topleft:1;
2517  uint8_t needs_topright:1;
2518  uint8_t invert_left:1;
2519  } edges[N_INTRA_PRED_MODES] = {
2520  [VERT_PRED] = { .needs_top = 1 },
2521  [HOR_PRED] = { .needs_left = 1 },
2522  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2523  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2524  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2525  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2526  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2527  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2528  [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2529  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2530  [LEFT_DC_PRED] = { .needs_left = 1 },
2531  [TOP_DC_PRED] = { .needs_top = 1 },
2532  [DC_128_PRED] = { 0 },
2533  [DC_127_PRED] = { 0 },
2534  [DC_129_PRED] = { 0 }
2535  };
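 // Illustrative note: the edges[] table records which neighbouring pixels each
 // intra mode actually reads, so the code below only builds (or pads with
 // fallback values) the top, top-left, top-right and left edges that will be used.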
2536 
2537  av_assert2(mode >= 0 && mode < 10);
2538  mode = mode_conv[mode][have_left][have_top];
2539  if (edges[mode].needs_top) {
2540  uint8_t *top, *topleft;
2541  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
2542  int n_px_need_tr = 0;
2543 
2544  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2545  n_px_need_tr = 4;
2546 
2547  // if top of sb64-row, use s->intra_pred_data[] instead of
2548  // dst[-stride] for intra prediction (it contains pre- instead of
2549  // post-loopfilter data)
2550  if (have_top) {
2551  top = !(row & 7) && !y ?
2552  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2553  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2554  if (have_left)
2555  topleft = !(row & 7) && !y ?
2556  s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
2557  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2558  &dst_inner[-stride_inner];
2559  }
2560 
2561  if (have_top &&
2562  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2563  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2564  n_px_need + n_px_need_tr <= n_px_have) {
2565  *a = top;
2566  } else {
2567  if (have_top) {
2568  if (n_px_need <= n_px_have) {
2569  memcpy(*a, top, n_px_need * bytesperpixel);
2570  } else {
2571 #define memset_bpp(c, i1, v, i2, num) do { \
2572  if (bytesperpixel == 1) { \
2573  memset(&(c)[(i1)], (v)[(i2)], (num)); \
2574  } else { \
2575  int n, val = AV_RN16A(&(v)[(i2) * 2]); \
2576  for (n = 0; n < (num); n++) { \
2577  AV_WN16A(&(c)[((i1) + n) * 2], val); \
2578  } \
2579  } \
2580 } while (0)
2581  memcpy(*a, top, n_px_have * bytesperpixel);
2582  memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
2583  }
2584  } else {
2585 #define memset_val(c, val, num) do { \
2586  if (bytesperpixel == 1) { \
2587  memset((c), (val), (num)); \
2588  } else { \
2589  int n; \
2590  for (n = 0; n < (num); n++) { \
2591  AV_WN16A(&(c)[n * 2], (val)); \
2592  } \
2593  } \
2594 } while (0)
2595  memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
2596  }
2597  if (edges[mode].needs_topleft) {
2598  if (have_left && have_top) {
2599 #define assign_bpp(c, i1, v, i2) do { \
2600  if (bytesperpixel == 1) { \
2601  (c)[(i1)] = (v)[(i2)]; \
2602  } else { \
2603  AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
2604  } \
2605 } while (0)
2606  assign_bpp(*a, -1, topleft, -1);
2607  } else {
2608 #define assign_val(c, i, v) do { \
2609  if (bytesperpixel == 1) { \
2610  (c)[(i)] = (v); \
2611  } else { \
2612  AV_WN16A(&(c)[(i) * 2], (v)); \
2613  } \
2614 } while (0)
2615  assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
2616  }
2617  }
2618  if (tx == TX_4X4 && edges[mode].needs_topright) {
2619  if (have_top && have_right &&
2620  n_px_need + n_px_need_tr <= n_px_have) {
2621  memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
2622  } else {
2623  memset_bpp(*a, 4, *a, 3, 4);
2624  }
2625  }
2626  }
2627  }
2628  if (edges[mode].needs_left) {
2629  if (have_left) {
2630  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
2631  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2632  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2633 
2634  if (edges[mode].invert_left) {
2635  if (n_px_need <= n_px_have) {
2636  for (i = 0; i < n_px_need; i++)
2637  assign_bpp(l, i, &dst[i * stride], -1);
2638  } else {
2639  for (i = 0; i < n_px_have; i++)
2640  assign_bpp(l, i, &dst[i * stride], -1);
2641  memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
2642  }
2643  } else {
2644  if (n_px_need <= n_px_have) {
2645  for (i = 0; i < n_px_need; i++)
2646  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2647  } else {
2648  for (i = 0; i < n_px_have; i++)
2649  assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
2650  memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
2651  }
2652  }
2653  } else {
2654  memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
2655  }
2656  }
2657 
2658  return mode;
2659 }
2660 
2661 static av_always_inline void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off,
2662  ptrdiff_t uv_off, int bytesperpixel)
2663 {
2664  VP9Context *s = ctx->priv_data;
2665  VP9Block *b = s->b;
2666  int row = s->row, col = s->col;
2667  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2668  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2669  int end_x = FFMIN(2 * (s->cols - col), w4);
2670  int end_y = FFMIN(2 * (s->rows - row), h4);
2671  int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
2672  int uvstep1d = 1 << b->uvtx, p;
2673  uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
2674  LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
2675  LOCAL_ALIGNED_32(uint8_t, l, [64]);
2676 
2677  for (n = 0, y = 0; y < end_y; y += step1d) {
2678  uint8_t *ptr = dst, *ptr_r = dst_r;
2679  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
2680  ptr_r += 4 * step1d * bytesperpixel, n += step) {
2681  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2682  y * 2 + x : 0];
2683  uint8_t *a = &a_buf[32];
2684  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2685  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2686 
2687  mode = check_intra_mode(s, mode, &a, ptr_r,
2688  s->s.frames[CUR_FRAME].tf.f->linesize[0],
2689  ptr, s->y_stride, l,
2690  col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
2691  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2692  if (eob)
2693  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2694  s->block + 16 * n * bytesperpixel, eob);
2695  }
2696  dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
2697  dst += 4 * step1d * s->y_stride;
2698  }
2699 
2700  // U/V
2701  w4 >>= s->ss_h;
2702  end_x >>= s->ss_h;
2703  end_y >>= s->ss_v;
2704  step = 1 << (b->uvtx * 2);
2705  for (p = 0; p < 2; p++) {
2706  dst = s->dst[1 + p];
2707  dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2708  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2709  uint8_t *ptr = dst, *ptr_r = dst_r;
2710  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
2711  ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
2712  int mode = b->uvmode;
2713  uint8_t *a = &a_buf[32];
2714  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2715 
2716  mode = check_intra_mode(s, mode, &a, ptr_r,
2717  s->s.frames[CUR_FRAME].tf.f->linesize[1],
2718  ptr, s->uv_stride, l, col, x, w4, row, y,
2719  b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
2720  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2721  if (eob)
2722  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2723  s->uvblock[p] + 16 * n * bytesperpixel, eob);
2724  }
2725  dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
2726  dst += 4 * uvstep1d * s->uv_stride;
2727  }
2728  }
2729 }
2730 
2731 static void intra_recon_8bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2732 {
2733  intra_recon(ctx, y_off, uv_off, 1);
2734 }
2735 
2736 static void intra_recon_16bpp(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2737 {
2738  intra_recon(ctx, y_off, uv_off, 2);
2739 }
2740 
 2741 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
 2742  uint8_t *dst, ptrdiff_t dst_stride,
2743  const uint8_t *ref, ptrdiff_t ref_stride,
 2744  ThreadFrame *ref_frame,
 2745  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2746  int bw, int bh, int w, int h, int bytesperpixel)
2747 {
2748  int mx = mv->x, my = mv->y, th;
2749 
2750  y += my >> 3;
2751  x += mx >> 3;
2752  ref += y * ref_stride + x * bytesperpixel;
2753  mx &= 7;
2754  my &= 7;
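 // Illustrative note: luma motion vectors are in 1/8-pel units; the integer part
 // (mx >> 3, my >> 3) was folded into the source pointer above, and the remaining
 // subpel phase is doubled (mx << 1, my << 1) in the mc call below to match the
 // 1/16-pel phase indexing used by the dsp mc functions.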
2755  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2756  // we use +7 because the last 7 pixels of each sbrow can be changed in
2757  // the longest loopfilter of the next sbrow
2758  th = (y + bh + 4 * !!my + 7) >> 6;
2759  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2760  // The arm/aarch64 _hv filters read one more row than what actually is
2761  // needed, so switch to emulated edge one pixel sooner vertically
2762  // (!!my * 5) than horizontally (!!mx * 4).
2763  if (x < !!mx * 3 || y < !!my * 3 ||
2764  x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
 2765  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2766  ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
2767  160, ref_stride,
2768  bw + !!mx * 7, bh + !!my * 7,
2769  x - !!mx * 3, y - !!my * 3, w, h);
2770  ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2771  ref_stride = 160;
2772  }
2773  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2774 }
2775 
 2776 static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
 2777  uint8_t *dst_u, uint8_t *dst_v,
2778  ptrdiff_t dst_stride,
2779  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2780  const uint8_t *ref_v, ptrdiff_t src_stride_v,
 2781  ThreadFrame *ref_frame,
 2782  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2783  int bw, int bh, int w, int h, int bytesperpixel)
2784 {
2785  int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
2786 
2787  y += my >> 4;
2788  x += mx >> 4;
2789  ref_u += y * src_stride_u + x * bytesperpixel;
2790  ref_v += y * src_stride_v + x * bytesperpixel;
2791  mx &= 15;
2792  my &= 15;
2793  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2794  // we use +7 because the last 7 pixels of each sbrow can be changed in
2795  // the longest loopfilter of the next sbrow
2796  th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
2797  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2798  // The arm/aarch64 _hv filters read one more row than what actually is
2799  // needed, so switch to emulated edge one pixel sooner vertically
2800  // (!!my * 5) than horizontally (!!mx * 4).
2801  if (x < !!mx * 3 || y < !!my * 3 ||
2802  x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
 2803  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2804  ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
2805  160, src_stride_u,
2806  bw + !!mx * 7, bh + !!my * 7,
2807  x - !!mx * 3, y - !!my * 3, w, h);
2808  ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2809  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
2810 
 2811  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2812  ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
2813  160, src_stride_v,
2814  bw + !!mx * 7, bh + !!my * 7,
2815  x - !!mx * 3, y - !!my * 3, w, h);
2816  ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
2817  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
2818  } else {
2819  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2820  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2821  }
2822 }
2823 
2824 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2825  px, py, pw, ph, bw, bh, w, h, i) \
2826  mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2827  mv, bw, bh, w, h, bytesperpixel)
2828 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2829  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2830  mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2831  row, col, mv, bw, bh, w, h, bytesperpixel)
2832 #define SCALED 0
2833 #define FN(x) x##_8bpp
2834 #define BYTES_PER_PIXEL 1
2835 #include "vp9_mc_template.c"
2836 #undef FN
2837 #undef BYTES_PER_PIXEL
2838 #define FN(x) x##_16bpp
2839 #define BYTES_PER_PIXEL 2
2840 #include "vp9_mc_template.c"
2841 #undef mc_luma_dir
2842 #undef mc_chroma_dir
2843 #undef FN
2844 #undef BYTES_PER_PIXEL
2845 #undef SCALED
2846 
 2847 static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
 2848  vp9_mc_func (*mc)[2],
2849  uint8_t *dst, ptrdiff_t dst_stride,
2850  const uint8_t *ref, ptrdiff_t ref_stride,
 2851  ThreadFrame *ref_frame,
 2852  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2853  int px, int py, int pw, int ph,
2854  int bw, int bh, int w, int h, int bytesperpixel,
2855  const uint16_t *scale, const uint8_t *step)
2856 {
2857  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2858  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2859  mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
2860  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2861  } else {
2862 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
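 // Illustrative note: scale[dim] is a 14-bit fixed-point ratio between the
 // reference and current frame size, so scale_mv() maps a position or motion
 // vector from current-frame into reference-frame coordinates, e.g. a reference
 // twice as wide as the current frame gives scale[0] == 2 << 14 and doubles the
 // x component.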
2863  int mx, my;
2864  int refbw_m1, refbh_m1;
2865  int th;
2866  VP56mv mv;
2867 
2868  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2869  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2870  // BUG libvpx seems to scale the two components separately. This introduces
2871  // rounding errors but we have to reproduce them to be exactly compatible
2872  // with the output from libvpx...
2873  mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2874  my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2875 
2876  y = my >> 4;
2877  x = mx >> 4;
2878  ref += y * ref_stride + x * bytesperpixel;
2879  mx &= 15;
2880  my &= 15;
2881  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2882  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2883  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2884  // we use +7 because the last 7 pixels of each sbrow can be changed in
2885  // the longest loopfilter of the next sbrow
2886  th = (y + refbh_m1 + 4 + 7) >> 6;
2887  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2888  // The arm/aarch64 _hv filters read one more row than what actually is
2889  // needed, so switch to emulated edge one pixel sooner vertically
2890  // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2891  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
 2892  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2893  ref - 3 * ref_stride - 3 * bytesperpixel,
2894  288, ref_stride,
2895  refbw_m1 + 8, refbh_m1 + 8,
2896  x - 3, y - 3, w, h);
2897  ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2898  ref_stride = 288;
2899  }
2900  smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
2901  }
2902 }
2903 
 2904 static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
 2905  vp9_mc_func (*mc)[2],
2906  uint8_t *dst_u, uint8_t *dst_v,
2907  ptrdiff_t dst_stride,
2908  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2909  const uint8_t *ref_v, ptrdiff_t src_stride_v,
 2910  ThreadFrame *ref_frame,
 2911  ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
2912  int px, int py, int pw, int ph,
2913  int bw, int bh, int w, int h, int bytesperpixel,
2914  const uint16_t *scale, const uint8_t *step)
2915 {
2916  if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
2917  s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
2918  mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
2919  ref_v, src_stride_v, ref_frame,
2920  y, x, in_mv, bw, bh, w, h, bytesperpixel);
2921  } else {
2922  int mx, my;
2923  int refbw_m1, refbh_m1;
2924  int th;
2925  VP56mv mv;
2926 
2927  if (s->ss_h) {
2928  // BUG https://code.google.com/p/webm/issues/detail?id=820
2929  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
2930  mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
2931  } else {
2932  mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
2933  mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
2934  }
2935  if (s->ss_v) {
2936  // BUG https://code.google.com/p/webm/issues/detail?id=820
2937  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
2938  my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
2939  } else {
2940  mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
2941  my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
2942  }
2943 #undef scale_mv
2944  y = my >> 4;
2945  x = mx >> 4;
2946  ref_u += y * src_stride_u + x * bytesperpixel;
2947  ref_v += y * src_stride_v + x * bytesperpixel;
2948  mx &= 15;
2949  my &= 15;
2950  refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
2951  refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
2952  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2953  // we use +7 because the last 7 pixels of each sbrow can be changed in
2954  // the longest loopfilter of the next sbrow
2955  th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
2956  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2957  // The arm/aarch64 _hv filters read one more row than what actually is
2958  // needed, so switch to emulated edge one pixel sooner vertically
2959  // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
2960  if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
 2961  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2962  ref_u - 3 * src_stride_u - 3 * bytesperpixel,
2963  288, src_stride_u,
2964  refbw_m1 + 8, refbh_m1 + 8,
2965  x - 3, y - 3, w, h);
2966  ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2967  smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
2968 
 2969  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
 2970  ref_v - 3 * src_stride_v - 3 * bytesperpixel,
2971  288, src_stride_v,
2972  refbw_m1 + 8, refbh_m1 + 8,
2973  x - 3, y - 3, w, h);
2974  ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
2975  smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
2976  } else {
2977  smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
2978  smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
2979  }
2980  }
2981 }
2982 
2983 #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
2984  px, py, pw, ph, bw, bh, w, h, i) \
2985  mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
2986  mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2987  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2988 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2989  row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
2990  mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
2991  row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
2992  s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
2993 #define SCALED 1
2994 #define FN(x) x##_scaled_8bpp
2995 #define BYTES_PER_PIXEL 1
2996 #include "vp9_mc_template.c"
2997 #undef FN
2998 #undef BYTES_PER_PIXEL
2999 #define FN(x) x##_scaled_16bpp
3000 #define BYTES_PER_PIXEL 2
3001 #include "vp9_mc_template.c"
3002 #undef mc_luma_dir
3003 #undef mc_chroma_dir
3004 #undef FN
3005 #undef BYTES_PER_PIXEL
3006 #undef SCALED
3007 
3008 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
3009 {
3010  VP9Context *s = ctx->priv_data;
3011  VP9Block *b = s->b;
3012  int row = s->row, col = s->col;
3013 
3014  if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
3015  if (bytesperpixel == 1) {
3016  inter_pred_scaled_8bpp(ctx);
3017  } else {
3018  inter_pred_scaled_16bpp(ctx);
3019  }
3020  } else {
3021  if (bytesperpixel == 1) {
3022  inter_pred_8bpp(ctx);
3023  } else {
3024  inter_pred_16bpp(ctx);
3025  }
3026  }
3027  if (!b->skip) {
3028  /* mostly copied intra_recon() */
3029 
3030  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
3031  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
3032  int end_x = FFMIN(2 * (s->cols - col), w4);
3033  int end_y = FFMIN(2 * (s->rows - row), h4);
3034  int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
3035  int uvstep1d = 1 << b->uvtx, p;
3036  uint8_t *dst = s->dst[0];
3037 
3038  // y itxfm add
3039  for (n = 0, y = 0; y < end_y; y += step1d) {
3040  uint8_t *ptr = dst;
3041  for (x = 0; x < end_x; x += step1d,
3042  ptr += 4 * step1d * bytesperpixel, n += step) {
3043  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
3044 
3045  if (eob)
3046  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
3047  s->block + 16 * n * bytesperpixel, eob);
3048  }
3049  dst += 4 * s->y_stride * step1d;
3050  }
3051 
3052  // uv itxfm add
3053  end_x >>= s->ss_h;
3054  end_y >>= s->ss_v;
3055  step = 1 << (b->uvtx * 2);
3056  for (p = 0; p < 2; p++) {
3057  dst = s->dst[p + 1];
3058  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
3059  uint8_t *ptr = dst;
3060  for (x = 0; x < end_x; x += uvstep1d,
3061  ptr += 4 * uvstep1d * bytesperpixel, n += step) {
3062  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
3063 
3064  if (eob)
3065  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
3066  s->uvblock[p] + 16 * n * bytesperpixel, eob);
3067  }
3068  dst += 4 * uvstep1d * s->uv_stride;
3069  }
3070  }
3071  }
3072 }
3073 
 3074 static void inter_recon_8bpp(AVCodecContext *ctx)
 3075 {
3076  inter_recon(ctx, 1);
3077 }
3078 
 3079 static void inter_recon_16bpp(AVCodecContext *ctx)
 3080 {
3081  inter_recon(ctx, 2);
3082 }
3083 
3084 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
3085  int row_and_7, int col_and_7,
3086  int w, int h, int col_end, int row_end,
3087  enum TxfmMode tx, int skip_inter)
3088 {
3089  static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
3090  static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
3091 
3092  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
3093  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
3094  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
3095  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
3096 
3097  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
3098  // edges. This means that for UV, we work on two subsampled blocks at
3099  // a time, and we only use the topleft block's mode information to set
3100  // things like block strength. Thus, for any block size smaller than
3101  // 16x16, ignore the odd portion of the block.
3102  if (tx == TX_4X4 && (ss_v | ss_h)) {
3103  if (h == ss_v) {
3104  if (row_and_7 & 1)
3105  return;
3106  if (!row_end)
3107  h += 1;
3108  }
3109  if (w == ss_h) {
3110  if (col_and_7 & 1)
3111  return;
3112  if (!col_end)
3113  w += 1;
3114  }
3115  }
3116 
3117  if (tx == TX_4X4 && !skip_inter) {
3118  int t = 1 << col_and_7, m_col = (t << w) - t, y;
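 // Illustrative note: t and m_col are bitmasks over the 8 4x4 columns of the
 // superblock: m_col = (t << w) - t sets w consecutive bits starting at
 // col_and_7, e.g. col_and_7 == 2 and w == 3 gives 0b0011100.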
3119  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
3120  int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
3121 
3122  for (y = row_and_7; y < h + row_and_7; y++) {
3123  int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);
3124 
3125  mask[0][y][1] |= m_row_8;
3126  mask[0][y][2] |= m_row_4;
3127  // for odd lines, if the odd col is not being filtered,
3128  // skip odd row also:
3129  // .---. <-- a
3130  // | |
3131  // |___| <-- b
3132  // ^ ^
3133  // c d
3134  //
3135  // if a/c are even row/col and b/d are odd, and d is skipped,
3136  // e.g. right edge of size-66x66.webm, then skip b also (bug)
3137  if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
3138  mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
3139  } else {
3140  mask[1][y][col_mask_id] |= m_col;
3141  }
3142  if (!ss_h)
3143  mask[0][y][3] |= m_col;
3144  if (!ss_v) {
3145  if (ss_h && (col_end & 1))
3146  mask[1][y][3] |= (t << (w - 1)) - t;
3147  else
3148  mask[1][y][3] |= m_col;
3149  }
3150  }
3151  } else {
3152  int y, t = 1 << col_and_7, m_col = (t << w) - t;
3153 
3154  if (!skip_inter) {
3155  int mask_id = (tx == TX_8X8);
3156  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
3157  int l2 = tx + ss_h - 1, step1d;
3158  int m_row = m_col & masks[l2];
3159 
 3160  // at odd UV col/row edges of tx16/tx32 blocks, force the 8-wide
 3161  // loopfilter to prevent filtering past the visible edge.
3162  if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
3163  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
3164  int m_row_8 = m_row - m_row_16;
3165 
3166  for (y = row_and_7; y < h + row_and_7; y++) {
3167  mask[0][y][0] |= m_row_16;
3168  mask[0][y][1] |= m_row_8;
3169  }
3170  } else {
3171  for (y = row_and_7; y < h + row_and_7; y++)
3172  mask[0][y][mask_id] |= m_row;
3173  }
3174 
3175  l2 = tx + ss_v - 1;
3176  step1d = 1 << l2;
3177  if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
3178  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
3179  mask[1][y][0] |= m_col;
3180  if (y - row_and_7 == h - 1)
3181  mask[1][y][1] |= m_col;
3182  } else {
3183  for (y = row_and_7; y < h + row_and_7; y += step1d)
3184  mask[1][y][mask_id] |= m_col;
3185  }
3186  } else if (tx != TX_4X4) {
3187  int mask_id;
3188 
3189  mask_id = (tx == TX_8X8) || (h == ss_v);
3190  mask[1][row_and_7][mask_id] |= m_col;
3191  mask_id = (tx == TX_8X8) || (w == ss_h);
3192  for (y = row_and_7; y < h + row_and_7; y++)
3193  mask[0][y][mask_id] |= t;
3194  } else {
3195  int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;
3196 
3197  for (y = row_and_7; y < h + row_and_7; y++) {
3198  mask[0][y][2] |= t4;
3199  mask[0][y][1] |= t8;
3200  }
3201  mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
3202  }
3203  }
3204 }
3205 
3206 static void decode_b(AVCodecContext *ctx, int row, int col,
3207  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
3208  enum BlockLevel bl, enum BlockPartition bp)
3209 {
3210  VP9Context *s = ctx->priv_data;
3211  VP9Block *b = s->b;
3212  enum BlockSize bs = bl * 3 + bp;
3213  int bytesperpixel = s->bytesperpixel;
3214  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
3215  int emu[2];
3216  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3217 
3218  s->row = row;
3219  s->row7 = row & 7;
3220  s->col = col;
3221  s->col7 = col & 7;
3222  s->min_mv.x = -(128 + col * 64);
3223  s->min_mv.y = -(128 + row * 64);
3224  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
3225  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
3226  if (s->pass < 2) {
3227  b->bs = bs;
3228  b->bl = bl;
3229  b->bp = bp;
3230  decode_mode(ctx);
3231  b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
3232  (s->ss_v && h4 * 2 == (1 << b->tx)));
3233 
3234  if (!b->skip) {
3235  int has_coeffs;
3236 
3237  if (bytesperpixel == 1) {
3238  has_coeffs = decode_coeffs_8bpp(ctx);
3239  } else {
3240  has_coeffs = decode_coeffs_16bpp(ctx);
3241  }
3242  if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
3243  b->skip = 1;
3244  memset(&s->above_skip_ctx[col], 1, w4);
3245  memset(&s->left_skip_ctx[s->row7], 1, h4);
3246  }
3247  } else {
3248  int row7 = s->row7;
3249 
3250 #define SPLAT_ZERO_CTX(v, n) \
3251  switch (n) { \
3252  case 1: v = 0; break; \
3253  case 2: AV_ZERO16(&v); break; \
3254  case 4: AV_ZERO32(&v); break; \
3255  case 8: AV_ZERO64(&v); break; \
3256  case 16: AV_ZERO128(&v); break; \
3257  }
3258 #define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
3259  do { \
3260  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
3261  if (s->ss_##dir2) { \
3262  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
3263  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
3264  } else { \
3265  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
3266  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
3267  } \
3268  } while (0)
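 // Illustrative note: for skipped blocks no coefficients are coded, so the y and
 // uv nonzero-coefficient contexts covering this block are simply zeroed; later
 // neighbours then read "no coefficients" when selecting their probabilities.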
3269 
3270  switch (w4) {
3271  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
3272  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
3273  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
3274  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
3275  }
3276  switch (h4) {
3277  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
3278  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
3279  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
3280  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
3281  }
3282  }
3283 
3284  if (s->pass == 1) {
3285  s->b++;
3286  s->block += w4 * h4 * 64 * bytesperpixel;
3287  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3288  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
3289  s->eob += 4 * w4 * h4;
3290  s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3291  s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
3292 
3293  return;
3294  }
3295  }
3296 
 3297  // use emulated overhang buffers if the stride of the target buffer can't
 3298  // hold the block; this makes it possible to support emu-edge and so on even
 3299  // if we have large block overhangs
3300  emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
3301  (row + h4) > s->rows;
3302  emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
3303  (row + h4) > s->rows;
3304  if (emu[0]) {
3305  s->dst[0] = s->tmp_y;
3306  s->y_stride = 128;
3307  } else {
3308  s->dst[0] = f->data[0] + yoff;
3309  s->y_stride = f->linesize[0];
3310  }
3311  if (emu[1]) {
3312  s->dst[1] = s->tmp_uv[0];
3313  s->dst[2] = s->tmp_uv[1];
3314  s->uv_stride = 128;
3315  } else {
3316  s->dst[1] = f->data[1] + uvoff;
3317  s->dst[2] = f->data[2] + uvoff;
3318  s->uv_stride = f->linesize[1];
3319  }
3320  if (b->intra) {
3321  if (s->s.h.bpp > 8) {
3322  intra_recon_16bpp(ctx, yoff, uvoff);
3323  } else {
3324  intra_recon_8bpp(ctx, yoff, uvoff);
3325  }
3326  } else {
3327  if (s->s.h.bpp > 8) {
3328  inter_recon_16bpp(ctx);
3329  } else {
3330  inter_recon_8bpp(ctx);
3331  }
3332  }
3333  if (emu[0]) {
3334  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3335 
3336  for (n = 0; o < w; n++) {
3337  int bw = 64 >> n;
3338 
3339  av_assert2(n <= 4);
3340  if (w & bw) {
3341  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
3342  s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
3343  o += bw;
3344  }
3345  }
3346  }
3347  if (emu[1]) {
3348  int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
3349  int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
3350 
3351  for (n = s->ss_h; o < w; n++) {
3352  int bw = 64 >> n;
3353 
3354  av_assert2(n <= 4);
3355  if (w & bw) {
3356  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
3357  s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
3358  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
3359  s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
3360  o += bw;
3361  }
3362  }
3363  }
3364 
3365  // pick filter level and find edges to apply filter to
3366  if (s->s.h.filter.level &&
3367  (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3368  [b->mode[3] != ZEROMV]) > 0) {
3369  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3370  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3371 
3372  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3373  mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3374  if (s->ss_h || s->ss_v)
3375  mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
3376  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3377  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3378  b->uvtx, skip_inter);
3379 
3380  if (!s->filter_lut.lim_lut[lvl]) {
3381  int sharp = s->s.h.filter.sharpness;
3382  int limit = lvl;
3383 
3384  if (sharp > 0) {
3385  limit >>= (sharp + 3) >> 2;
3386  limit = FFMIN(limit, 9 - sharp);
3387  }
3388  limit = FFMAX(limit, 1);
3389 
3390  s->filter_lut.lim_lut[lvl] = limit;
3391  s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3392  }
3393  }
3394 
3395  if (s->pass == 2) {
3396  s->b++;
3397  s->block += w4 * h4 * 64 * bytesperpixel;
3398  s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3399  s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
3400  s->eob += 4 * w4 * h4;
3401  s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3402  s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
3403  }
3404 }
3405 
3406 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3407  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3408 {
3409  VP9Context *s = ctx->priv_data;
3410  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3411  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3412  const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? vp9_default_kf_partition_probs[bl][c] :
3413  s->prob.p.partition[bl][c];
3414  enum BlockPartition bp;
3415  ptrdiff_t hbs = 4 >> bl;
3416  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3417  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3418  int bytesperpixel = s->bytesperpixel;
3419 
3420  if (bl == BL_8X8) {
3421  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3422  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3423  } else if (col + hbs < s->cols) { // FIXME why not <=?
3424  if (row + hbs < s->rows) { // FIXME why not <=?
3425  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3426  switch (bp) {
3427  case PARTITION_NONE:
3428  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3429  break;
3430  case PARTITION_H:
3431  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3432  yoff += hbs * 8 * y_stride;
3433  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3434  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3435  break;
3436  case PARTITION_V:
3437  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3438  yoff += hbs * 8 * bytesperpixel;
3439  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3440  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3441  break;
3442  case PARTITION_SPLIT:
3443  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3444  decode_sb(ctx, row, col + hbs, lflvl,
3445  yoff + 8 * hbs * bytesperpixel,
3446  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3447  yoff += hbs * 8 * y_stride;
3448  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3449  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3450  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3451  yoff + 8 * hbs * bytesperpixel,
3452  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3453  break;
3454  default:
3455  av_assert0(0);
3456  }
3457  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3458  bp = PARTITION_SPLIT;
3459  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3460  decode_sb(ctx, row, col + hbs, lflvl,
3461  yoff + 8 * hbs * bytesperpixel,
3462  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3463  } else {
3464  bp = PARTITION_H;
3465  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3466  }
3467  } else if (row + hbs < s->rows) { // FIXME why not <=?
3468  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3469  bp = PARTITION_SPLIT;
3470  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3471  yoff += hbs * 8 * y_stride;
3472  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3473  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3474  } else {
3475  bp = PARTITION_V;
3476  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3477  }
3478  } else {
3479  bp = PARTITION_SPLIT;
3480  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3481  }
3482  s->counts.partition[bl][c][bp]++;
3483 }
3484 
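 /* decode_sb_mem() is the second-pass counterpart of decode_sb(): it reads
  * nothing from the range coder and instead replays the block level and
  * partition (b->bl, b->bp) recorded in the s->b array, recursing only
  * where the stored level is finer than the current one. */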
3485 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3486  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3487 {
3488  VP9Context *s = ctx->priv_data;
3489  VP9Block *b = s->b;
3490  ptrdiff_t hbs = 4 >> bl;
3491  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3492  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3493  int bytesperpixel = s->bytesperpixel;
3494 
3495  if (bl == BL_8X8) {
3496  av_assert2(b->bl == BL_8X8);
3497  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3498  } else if (s->b->bl == bl) {
3499  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3500  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3501  yoff += hbs * 8 * y_stride;
3502  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3503  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3504  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3505  yoff += hbs * 8 * bytesperpixel;
3506  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3507  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3508  }
3509  } else {
3510  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3511  if (col + hbs < s->cols) { // FIXME why not <=?
3512  if (row + hbs < s->rows) {
3513  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
3514  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3515  yoff += hbs * 8 * y_stride;
3516  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3517  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3518  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3519  yoff + 8 * hbs * bytesperpixel,
3520  uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
3521  } else {
3522  yoff += hbs * 8 * bytesperpixel;
3523  uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
3524  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3525  }
3526  } else if (row + hbs < s->rows) {
3527  yoff += hbs * 8 * y_stride;
3528  uvoff += hbs * 8 * uv_stride >> s->ss_v;
3529  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3530  }
3531  }
3532 }
3533 
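 /* Loop-filter helpers: lvl holds one filter level per 8x8 block and the
  * mask bits mark which 8px edges need filtering at each transform size.
  * For an edge, L is the level, H = L >> 4 the high-edge-variance threshold
  * and E/I come from the mblim/lim LUTs; when two adjacent edges are
  * filtered in one call, the second edge's E/I/H are packed into bits 8..15
  * and dispatched to the loop_filter_mix2 functions. */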
3534 static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v,
3535  uint8_t *lvl, uint8_t (*mask)[4],
3536  uint8_t *dst, ptrdiff_t ls)
3537 {
3538  int y, x, bytesperpixel = s->bytesperpixel;
3539 
3540  // filter edges between columns (e.g. block1 | block2)
3541  for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) {
3542  uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v];
3543  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3544  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3545  unsigned hm = hm1 | hm2 | hm13 | hm23;
3546 
3547  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) {
3548  if (col || x > 1) {
3549  if (hm1 & x) {
3550  int L = *l, H = L >> 4;
3551  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3552 
3553  if (hmask1[0] & x) {
3554  if (hmask2[0] & x) {
3555  av_assert2(l[8 << ss_v] == L);
3556  s->dsp.loop_filter_16[0](ptr, ls, E, I, H);
3557  } else {
3558  s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H);
3559  }
3560  } else if (hm2 & x) {
3561  L = l[8 << ss_v];
3562  H |= (L >> 4) << 8;
3563  E |= s->filter_lut.mblim_lut[L] << 8;
3564  I |= s->filter_lut.lim_lut[L] << 8;
3565  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3566  [!!(hmask2[1] & x)]
3567  [0](ptr, ls, E, I, H);
3568  } else {
3569  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3570  [0](ptr, ls, E, I, H);
3571  }
3572  } else if (hm2 & x) {
3573  int L = l[8 << ss_v], H = L >> 4;
3574  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3575 
3576  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3577  [0](ptr + 8 * ls, ls, E, I, H);
3578  }
3579  }
3580  if (ss_h) {
3581  if (x & 0xAA)
3582  l += 2;
3583  } else {
3584  if (hm13 & x) {
3585  int L = *l, H = L >> 4;
3586  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3587 
3588  if (hm23 & x) {
3589  L = l[8 << ss_v];
3590  H |= (L >> 4) << 8;
3591  E |= s->filter_lut.mblim_lut[L] << 8;
3592  I |= s->filter_lut.lim_lut[L] << 8;
3593  s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3594  } else {
3595  s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H);
3596  }
3597  } else if (hm23 & x) {
3598  int L = l[8 << ss_v], H = L >> 4;
3599  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3600 
3601  s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H);
3602  }
3603  l++;
3604  }
3605  }
3606  }
3607 }
3608 
3609 static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v,
3610  uint8_t *lvl, uint8_t (*mask)[4],
3611  uint8_t *dst, ptrdiff_t ls)
3612 {
3613  int y, x, bytesperpixel = s->bytesperpixel;
3614 
3615  // block1
3616  // filter edges between rows (e.g. ------)
3617  // block2
3618  for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) {
3619  uint8_t *ptr = dst, *l = lvl, *vmask = mask[y];
3620  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3621 
3622  for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) {
3623  if (row || y) {
3624  if (vm & x) {
3625  int L = *l, H = L >> 4;
3626  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3627 
3628  if (vmask[0] & x) {
3629  if (vmask[0] & (x << (1 + ss_h))) {
3630  av_assert2(l[1 + ss_h] == L);
3631  s->dsp.loop_filter_16[1](ptr, ls, E, I, H);
3632  } else {
3633  s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H);
3634  }
3635  } else if (vm & (x << (1 + ss_h))) {
3636  L = l[1 + ss_h];
3637  H |= (L >> 4) << 8;
3638  E |= s->filter_lut.mblim_lut[L] << 8;
3639  I |= s->filter_lut.lim_lut[L] << 8;
3640  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3641  [!!(vmask[1] & (x << (1 + ss_h)))]
3642  [1](ptr, ls, E, I, H);
3643  } else {
3644  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3645  [1](ptr, ls, E, I, H);
3646  }
3647  } else if (vm & (x << (1 + ss_h))) {
3648  int L = l[1 + ss_h], H = L >> 4;
3649  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3650 
3651  s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))]
3652  [1](ptr + 8 * bytesperpixel, ls, E, I, H);
3653  }
3654  }
3655  if (!ss_v) {
3656  if (vm3 & x) {
3657  int L = *l, H = L >> 4;
3658  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3659 
3660  if (vm3 & (x << (1 + ss_h))) {
3661  L = l[1 + ss_h];
3662  H |= (L >> 4) << 8;
3663  E |= s->filter_lut.mblim_lut[L] << 8;
3664  I |= s->filter_lut.lim_lut[L] << 8;
3665  s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H);
3666  } else {
3667  s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H);
3668  }
3669  } else if (vm3 & (x << (1 + ss_h))) {
3670  int L = l[1 + ss_h], H = L >> 4;
3671  int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L];
3672 
3673  s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H);
3674  }
3675  }
3676  }
3677  if (ss_v) {
3678  if (y & 1)
3679  lvl += 16;
3680  } else {
3681  lvl += 8;
3682  }
3683  }
3684 }
3685 
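 /* Per-superblock loop filter: column (vertical) edges are filtered before
  * row (horizontal) edges, first on luma and then on the two chroma planes,
  * which share one set of subsampling-aware masks. */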
3686 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3687  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3688 {
3689  VP9Context *s = ctx->priv_data;
3690  AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
3691  uint8_t *dst = f->data[0] + yoff;
3692  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3693  uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v];
3694  int p;
3695 
3696  // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3697  // if you think of them as acting on a 8x8 block max, we can interleave
3698  // each v/h within the single x loop, but that only works if we work on
3699  // 8 pixel blocks, and we won't always do that (we want at least 16px
3700  // to use SSE2 optimizations, perhaps 32 for AVX2)
3701 
3702  filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y);
3703  filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y);
3704 
3705  for (p = 0; p < 2; p++) {
3706  dst = f->data[1 + p] + uvoff;
3707  filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv);
3708  filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv);
3709  }
3710 }
3711 
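 /* Maps a tile index to its start/end position in units of 8x8 blocks,
  * with n the frame dimension in superblocks and log2_n the log2 tile
  * count. E.g. n = 10 superblock columns, 4 tile columns (log2_n = 2),
  * idx = 1: sb_start = 2, sb_end = 5, so *start = 16 and *end = 40. */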
3712 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3713 {
3714  int sb_start = ( idx * n) >> log2_n;
3715  int sb_end = ((idx + 1) * n) >> log2_n;
3716  *start = FFMIN(sb_start, n) << 3;
3717  *end = FFMIN(sb_end, n) << 3;
3718 }
3719 
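 /* Backward adaptation: blend the old probability p1 towards the observed
  * rate p2 = (256 * ct0 + ct / 2) / ct with a weight that saturates at
  * max_count. E.g. p1 = 128, ct0 = 30, ct1 = 10, max_count = 20,
  * update_factor = 128: the factor stays 128, p2 = 192, and the result is
  * 128 + (((192 - 128) * 128 + 128) >> 8) = 160. */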
3720 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3721  int max_count, int update_factor)
3722 {
3723  unsigned ct = ct0 + ct1, p2, p1;
3724 
3725  if (!ct)
3726  return;
3727 
3728  update_factor = FASTDIV(update_factor * FFMIN(ct, max_count), max_count);
3729  p1 = *p;
3730  p2 = ((((int64_t) ct0) << 8) + (ct >> 1)) / ct;
3731  p2 = av_clip(p2, 1, 255);
3732 
3733  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3734  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3735 }
3736 
3737 static void adapt_probs(VP9Context *s)
3738 {
3739  int i, j, k, l, m;
3740  prob_context *p = &s->prob_ctx[s->s.h.framectxid].p;
3741  int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128;
3742 
3743  // coefficients
3744  for (i = 0; i < 4; i++)
3745  for (j = 0; j < 2; j++)
3746  for (k = 0; k < 2; k++)
3747  for (l = 0; l < 6; l++)
3748  for (m = 0; m < 6; m++) {
3749  uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m];
3750  unsigned *e = s->counts.eob[i][j][k][l][m];
3751  unsigned *c = s->counts.coef[i][j][k][l][m];
3752 
3753  if (l == 0 && m >= 3) // dc only has 3 pt
3754  break;
3755 
3756  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3757  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3758  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3759  }
3760 
3761  if (s->s.h.keyframe || s->s.h.intraonly) {
3762  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3763  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3764  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3765  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3766  return;
3767  }
3768 
3769  // skip flag
3770  for (i = 0; i < 3; i++)
3771  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3772 
3773  // intra/inter flag
3774  for (i = 0; i < 4; i++)
3775  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3776 
3777  // comppred flag
3778  if (s->s.h.comppredmode == PRED_SWITCHABLE) {
3779  for (i = 0; i < 5; i++)
3780  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3781  }
3782 
3783  // reference frames
3784  if (s->s.h.comppredmode != PRED_SINGLEREF) {
3785  for (i = 0; i < 5; i++)
3786  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3787  s->counts.comp_ref[i][1], 20, 128);
3788  }
3789 
3790  if (s->s.h.comppredmode != PRED_COMPREF) {
3791  for (i = 0; i < 5; i++) {
3792  uint8_t *pp = p->single_ref[i];
3793  unsigned (*c)[2] = s->counts.single_ref[i];
3794 
3795  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3796  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3797  }
3798  }
3799 
3800  // block partitioning
3801  for (i = 0; i < 4; i++)
3802  for (j = 0; j < 4; j++) {
3803  uint8_t *pp = p->partition[i][j];
3804  unsigned *c = s->counts.partition[i][j];
3805 
3806  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3807  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3808  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3809  }
3810 
3811  // tx size
3812  if (s->s.h.txfmmode == TX_SWITCHABLE) {
3813  for (i = 0; i < 2; i++) {
3814  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3815 
3816  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3817  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3818  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3819  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3820  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3821  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3822  }
3823  }
3824 
3825  // interpolation filter
3826  if (s->s.h.filtermode == FILTER_SWITCHABLE) {
3827  for (i = 0; i < 4; i++) {
3828  uint8_t *pp = p->filter[i];
3829  unsigned *c = s->counts.filter[i];
3830 
3831  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3832  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3833  }
3834  }
3835 
3836  // inter modes
3837  for (i = 0; i < 7; i++) {
3838  uint8_t *pp = p->mv_mode[i];
3839  unsigned *c = s->counts.mv_mode[i];
3840 
3841  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3842  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3843  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3844  }
3845 
3846  // mv joints
3847  {
3848  uint8_t *pp = p->mv_joint;
3849  unsigned *c = s->counts.mv_joint;
3850 
3851  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3852  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3853  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3854  }
3855 
3856  // mv components
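 /* For each of the two mv components, the sign, class, class0, per-bit,
  * fractional and high-precision probabilities are adapted from their
  * counts. The class probabilities form a tree, so each adapt_prob() call
  * splits the remaining count mass (sum), which is reduced by the consumed
  * counts while walking down the tree. */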
3857  for (i = 0; i < 2; i++) {
3858  uint8_t *pp;
3859  unsigned *c, (*c2)[2], sum;
3860 
3861  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3862  s->counts.mv_comp[i].sign[1], 20, 128);
3863 
3864  pp = p->mv_comp[i].classes;
3865  c = s->counts.mv_comp[i].classes;
3866  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3867  adapt_prob(&pp[0], c[0], sum, 20, 128);
3868  sum -= c[1];
3869  adapt_prob(&pp[1], c[1], sum, 20, 128);
3870  sum -= c[2] + c[3];
3871  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3872  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3873  sum -= c[4] + c[5];
3874  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3875  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3876  sum -= c[6];
3877  adapt_prob(&pp[6], c[6], sum, 20, 128);
3878  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3879  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3880  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3881 
3882  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3883  s->counts.mv_comp[i].class0[1], 20, 128);
3884  pp = p->mv_comp[i].bits;
3885  c2 = s->counts.mv_comp[i].bits;
3886  for (j = 0; j < 10; j++)
3887  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3888 
3889  for (j = 0; j < 2; j++) {
3890  pp = p->mv_comp[i].class0_fp[j];
3891  c = s->counts.mv_comp[i].class0_fp[j];
3892  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3893  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3894  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3895  }
3896  pp = p->mv_comp[i].fp;
3897  c = s->counts.mv_comp[i].fp;
3898  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3899  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3900  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3901 
3902  if (s->s.h.highprecisionmvs) {
3903  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3904  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3905  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3906  s->counts.mv_comp[i].hp[1], 20, 128);
3907  }
3908  }
3909 
3910  // y intra modes
3911  for (i = 0; i < 4; i++) {
3912  uint8_t *pp = p->y_mode[i];
3913  unsigned *c = s->counts.y_mode[i], sum, s2;
3914 
3915  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3916  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3917  sum -= c[TM_VP8_PRED];
3918  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3919  sum -= c[VERT_PRED];
3920  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3921  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3922  sum -= s2;
3923  adapt_prob(&pp[3], s2, sum, 20, 128);
3924  s2 -= c[HOR_PRED];
3925  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3926  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3927  sum -= c[DIAG_DOWN_LEFT_PRED];
3928  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3929  sum -= c[VERT_LEFT_PRED];
3930  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3931  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3932  }
3933 
3934  // uv intra modes
3935  for (i = 0; i < 10; i++) {
3936  uint8_t *pp = p->uv_mode[i];
3937  unsigned *c = s->counts.uv_mode[i], sum, s2;
3938 
3939  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3940  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3941  sum -= c[TM_VP8_PRED];
3942  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3943  sum -= c[VERT_PRED];
3944  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3945  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3946  sum -= s2;
3947  adapt_prob(&pp[3], s2, sum, 20, 128);
3948  s2 -= c[HOR_PRED];
3949  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3950  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3951  sum -= c[DIAG_DOWN_LEFT_PRED];
3952  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3953  sum -= c[VERT_LEFT_PRED];
3954  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3955  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3956  }
3957 }
3958 
3959 static void free_buffers(VP9Context *s)
3960 {
3961  av_freep(&s->intra_pred_data[0]);
3962  av_freep(&s->b_base);
3963  av_freep(&s->block_base);
3964 }
3965 
3966 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3967 {
3968  VP9Context *s = ctx->priv_data;
3969  int i;
3970 
3971  for (i = 0; i < 3; i++) {
3972  if (s->s.frames[i].tf.f->buf[0])
3973  vp9_unref_frame(ctx, &s->s.frames[i]);
3974  av_frame_free(&s->s.frames[i].tf.f);
3975  }
3976  for (i = 0; i < 8; i++) {
3977  if (s->s.refs[i].f->buf[0])
3978  ff_thread_release_buffer(ctx, &s->s.refs[i]);
3979  av_frame_free(&s->s.refs[i].f);
3980  if (s->next_refs[i].f->buf[0])
3981  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3982  av_frame_free(&s->next_refs[i].f);
3983  }
3984  free_buffers(s);
3985  av_freep(&s->c_b);
3986  s->c_b_size = 0;
3987 
3988  return 0;
3989 }
3990 
3991 
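 /* Top-level frame decode: parse the frame header, handle the case where
  * the packet only re-shows an already decoded reference frame (header
  * parser returning 0, presumably VP9's show-existing-frame mechanism),
  * rotate the segmentation-map/mv-pair reference frames, then either hand
  * the packet to a hwaccel or run the software tile decode loop below. */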
3992 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3993  int *got_frame, AVPacket *pkt)
3994 {
3995  const uint8_t *data = pkt->data;
3996  int size = pkt->size;
3997  VP9Context *s = ctx->priv_data;
3998  int res, tile_row, tile_col, i, ref, row, col;
3999  int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
4000  (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
4001  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
4002  AVFrame *f;
4003  int bytesperpixel;
4004 
4005  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
4006  return res;
4007  } else if (res == 0) {
4008  if (!s->s.refs[ref].f->buf[0]) {
4009  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
4010  return AVERROR_INVALIDDATA;
4011  }
4012  if ((res = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
4013  return res;
4014  ((AVFrame *)frame)->pts = pkt->pts;
4015 #if FF_API_PKT_PTS
4016 FF_DISABLE_DEPRECATION_WARNINGS
4017  ((AVFrame *)frame)->pkt_pts = pkt->pts;
4018 FF_ENABLE_DEPRECATION_WARNINGS
4019 #endif
4020  ((AVFrame *)frame)->pkt_dts = pkt->dts;
4021  for (i = 0; i < 8; i++) {
4022  if (s->next_refs[i].f->buf[0])
4023  ff_thread_release_buffer(ctx, &s->next_refs[i]);
4024  if (s->s.refs[i].f->buf[0] &&
4025  (res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
4026  return res;
4027  }
4028  *got_frame = 1;
4029  return pkt->size;
4030  }
4031  data += res;
4032  size -= res;
4033 
4034  if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
4035  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
4036  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4037  if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4038  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
4039  return res;
4040  }
4041  if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
4042  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR]);
4043  if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
4044  (res = vp9_ref_frame(ctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
4045  return res;
4046  if (s->s.frames[CUR_FRAME].tf.f->buf[0])
4047  vp9_unref_frame(ctx, &s->s.frames[CUR_FRAME]);
4048  if ((res = vp9_alloc_frame(ctx, &s->s.frames[CUR_FRAME])) < 0)
4049  return res;
4050  f = s->s.frames[CUR_FRAME].tf.f;
4051  f->key_frame = s->s.h.keyframe;
4052  f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
4053  ls_y = f->linesize[0];
4054  ls_uv = f->linesize[1];
4055 
4056  if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
4057  (s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
4058  s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
4059  vp9_unref_frame(ctx, &s->s.frames[REF_FRAME_SEGMAP]);
4060  }
4061 
4062  // ref frame setup
4063  for (i = 0; i < 8; i++) {
4064  if (s->next_refs[i].f->buf[0])
4065  ff_thread_release_buffer(ctx, &s->next_refs[i]);
4066  if (s->s.h.refreshrefmask & (1 << i)) {
4067  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
4068  } else if (s->s.refs[i].f->buf[0]) {
4069  res = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
4070  }
4071  if (res < 0)
4072  return res;
4073  }
4074 
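 /* With a hwaccel attached, the raw packet is handed to the accelerator's
  * start_frame()/decode_slice()/end_frame() callbacks and the software
  * reconstruction below is skipped via the goto. */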
4075  if (ctx->hwaccel) {
4076  res = ctx->hwaccel->start_frame(ctx, NULL, 0);
4077  if (res < 0)
4078  return res;
4079  res = ctx->hwaccel->decode_slice(ctx, pkt->data, pkt->size);
4080  if (res < 0)
4081  return res;
4082  res = ctx->hwaccel->end_frame(ctx);
4083  if (res < 0)
4084  return res;
4085  goto finish;
4086  }
4087 
4088  // main tile decode loop
4089  bytesperpixel = s->bytesperpixel;
4090  memset(s->above_partition_ctx, 0, s->cols);
4091  memset(s->above_skip_ctx, 0, s->cols);
4092  if (s->s.h.keyframe || s->s.h.intraonly) {
4093  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
4094  } else {
4095  memset(s->above_mode_ctx, NEARESTMV, s->cols);
4096  }
4097  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
4098  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
4099  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
4100  memset(s->above_segpred_ctx, 0, s->cols);
4101  s->pass = s->s.frames[CUR_FRAME].uses_2pass =
4102  ctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
4103  if ((res = update_block_buffers(ctx)) < 0) {
4104  av_log(ctx, AV_LOG_ERROR,
4105  "Failed to allocate block buffers\n");
4106  return res;
4107  }
4108  if (s->s.h.refreshctx && s->s.h.parallelmode) {
4109  int j, k, l, m;
4110 
4111  for (i = 0; i < 4; i++) {