[FFmpeg-devel] [PATCH] Use larger tables for yuv > 8 bit to RGB conversion.

Sat Nov 9 20:56:37 CET 2013

This should allow for fairly precise YUV16 to RGB48 conversion,
for example.
However I believe that this specific implementation is not as accurate
as it could/should be, i.e. the table generation might be buggy.
In addition, it makes the scaled yuv->rgb path slightly slower, though
also more precise for 9- and 10-bit input.
Still not sure it is a good idea.
---
 libswscale/output.c           |  44 +++++-----
 libswscale/swscale_internal.h |  11 +--
 libswscale/swscale_unscaled.c |   1 -
 libswscale/yuv2rgb.c          | 195 ++++++++++++++++++++----------------------
 libswscale/yuv2rgb_template.c |  28 +++---
 5 files changed, 135 insertions(+), 144 deletions(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index ddb0d0c..d862510 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -1254,13 +1254,14 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
                      int y, enum AVPixelFormat target, int hasAlpha)
 {
     int i;
+    int shift = 27 - c->yuvtable_bits;
 
     for (i = 0; i < ((dstW + 1) >> 1); i++) {
         int j, A1, A2;
-        int Y1 = 1 << 18;
-        int Y2 = 1 << 18;
-        int U  = 1 << 18;
-        int V  = 1 << 18;
+        int Y1 = 1 << (shift - 1);
+        int Y2 = 1 << (shift - 1);
+        int U  = 1 << (shift - 1);
+        int V  = 1 << (shift - 1);
         const void *r, *g, *b;
 
         for (j = 0; j < lumFilterSize; j++) {
@@ -1271,10 +1272,10 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
             U += chrUSrc[j][i] * chrFilter[j];
             V += chrVSrc[j][i] * chrFilter[j];
         }
-        Y1 >>= 19;
-        Y2 >>= 19;
-        U  >>= 19;
-        V  >>= 19;
+        Y1 >>= shift;
+        Y2 >>= shift;
+        U  >>= shift;
+        V  >>= shift;
         if (hasAlpha) {
             A1 = 1 << 18;
             A2 = 1 << 18;
@@ -1306,6 +1307,7 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
                      int yalpha, int uvalpha, int y,
                      enum AVPixelFormat target, int hasAlpha)
 {
+    int shift = 27 - c->yuvtable_bits;
     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
@@ -1316,10 +1318,10 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
     int i;
 
     for (i = 0; i < ((dstW + 1) >> 1); i++) {
-        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
-        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
-        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
-        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
+        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> shift;
+        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> shift;
+        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> shift;
+        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> shift;
         int A1, A2;
         const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
                    *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
@@ -1344,15 +1346,17 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
                      int uvalpha, int y, enum AVPixelFormat target,
                      int hasAlpha)
 {
+    int shift = 15 - c->yuvtable_bits;
+    int round = 1 << (shift - 1);
     const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
     int i;
 
     if (uvalpha < 2048) {
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = (buf0[i * 2    ] + 64) >> 7;
-            int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
-            int U  = (ubuf0[i]        + 64) >> 7;
-            int V  = (vbuf0[i]        + 64) >> 7;
+            int Y1 = (buf0[i * 2    ] + round) >> shift;
+            int Y2 = (buf0[i * 2 + 1] + round) >> shift;
+            int U  = (ubuf0[i]        + round) >> shift;
+            int V  = (vbuf0[i]        + round) >> shift;
             int A1, A2;
             const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
                        *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
@@ -1371,10 +1375,10 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = (buf0[i * 2    ]     +  64) >> 7;
-            int Y2 = (buf0[i * 2 + 1]     +  64) >> 7;
-            int U  = (ubuf0[i] + ubuf1[i] + 128) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i] + 128) >> 8;
+            int Y1 = (buf0[i * 2    ]     + round) >> shift;
+            int Y2 = (buf0[i * 2 + 1]     + round) >> shift;
+            int U  = (ubuf0[i] + ubuf1[i] + 2*round) >> (shift + 1);
+            int V  = (vbuf0[i] + vbuf1[i] + 2*round) >> (shift + 1);
             int A1, A2;
             const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
                        *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6ad278e..2a93f6f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -37,7 +37,7 @@
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
-#define YUVRGB_TABLE_HEADROOM 128
+#define YUVRGB_TABLE_HEADROOM 512
 
 #define MAX_FILTER_SIZE 256
 
@@ -362,12 +362,13 @@ typedef struct SwsContext {
     int dstY;                     ///< Last destination vertical line output from last slice.
     int flags;                    ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
     void *yuvTable;             // pointer to the yuv->rgb table start so it can be freed()
+    int yuvtable_bits;
     // alignment ensures the offset can be added in a single
     // instruction on e.g. ARM
-    DECLARE_ALIGNED(16, int, table_gV)[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
-    uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
+    DECLARE_ALIGNED(16, int, table_gV)[1024 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_rV[1024 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_gU[1024 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_bU[1024 + 2*YUVRGB_TABLE_HEADROOM];
     DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, teh C vales are always at the XY_IDX points
 #define RY_IDX 0
 #define GY_IDX 1
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 8842f35..e96b12d 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1219,7 +1219,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
     if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUV422P ||
          srcFormat == AV_PIX_FMT_YUV420P9 || srcFormat == AV_PIX_FMT_YUV422P9 ||
          srcFormat == AV_PIX_FMT_YUV420P10 || srcFormat == AV_PIX_FMT_YUV422P10 ||
-         srcFormat == AV_PIX_FMT_YUV420P16 || srcFormat == AV_PIX_FMT_YUV422P16 ||
          srcFormat == AV_PIX_FMT_YUVA420P) && isAnyRGB(dstFormat) &&
         !(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) {
         c->swscale = ff_yuv2rgb_get_func_ptr(c);
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 28de37e..1d23aca 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -54,60 +54,60 @@ const int *sws_getCoefficients(int colorspace)
 }
 
 #define LOADCHROMA(i)                               \
-    U = pu[i] >> shift;                             \
-    V = pv[i] >> shift;                             \
-    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                     \
-    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);  \
+    U = pu[i];                                      \
+    V = pv[i];                                      \
+    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
+    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
     b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB(dst, src, i)                         \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y] + g[Y] + b[Y];            \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
 
 #define PUTRGB24(dst, src, i)                       \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[6 * i + 0] = r[Y];                          \
     dst[6 * i + 1] = g[Y];                          \
     dst[6 * i + 2] = b[Y];                          \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[6 * i + 3] = r[Y];                          \
     dst[6 * i + 4] = g[Y];                          \
     dst[6 * i + 5] = b[Y];
 
 #define PUTBGR24(dst, src, i)                       \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[6 * i + 0] = b[Y];                          \
     dst[6 * i + 1] = g[Y];                          \
     dst[6 * i + 2] = r[Y];                          \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[6 * i + 3] = b[Y];                          \
     dst[6 * i + 4] = g[Y];                          \
     dst[6 * i + 5] = r[Y];
 
 #define PUTRGBA(dst, ysrc, asrc, i, s)                                  \
-    Y              = ysrc[2 * i] >> shift;                              \
-    dst[2 * i]     = r[Y] + g[Y] + b[Y] + (asrc[2 * i]     >> shift << s); \
-    Y              = ysrc[2 * i + 1] >> shift;                          \
-    dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] >> shift << s);
+    Y              = ysrc[2 * i];                                       \
+    dst[2 * i]     = r[Y] + g[Y] + b[Y] + (asrc[2 * i]     << s);       \
+    Y              = ysrc[2 * i + 1];                                   \
+    dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
 
 #define PUTRGB48(dst, src, i)                       \
-    Y                = src[ 2 * i] >> shift;        \
+    Y                = src[ 2 * i];                 \
     dst[12 * i +  0] = dst[12 * i +  1] = r[Y];     \
     dst[12 * i +  2] = dst[12 * i +  3] = g[Y];     \
     dst[12 * i +  4] = dst[12 * i +  5] = b[Y];     \
-    Y                = src[ 2 * i + 1] >> shift;    \
+    Y                = src[ 2 * i + 1];             \
     dst[12 * i +  6] = dst[12 * i +  7] = r[Y];     \
     dst[12 * i +  8] = dst[12 * i +  9] = g[Y];     \
     dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
 
 #define PUTBGR48(dst, src, i)                       \
-    Y                = src[2 * i] >> shift;         \
+    Y                = src[2 * i];                  \
     dst[12 * i +  0] = dst[12 * i +  1] = b[Y];     \
     dst[12 * i +  2] = dst[12 * i +  3] = g[Y];     \
     dst[12 * i +  4] = dst[12 * i +  5] = r[Y];     \
-    Y                = src[2  * i +  1] >> shift;   \
+    Y                = src[2  * i +  1];            \
     dst[12 * i +  6] = dst[12 * i +  7] = b[Y];     \
     dst[12 * i +  8] = dst[12 * i +  9] = g[Y];     \
     dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
@@ -164,35 +164,15 @@ const int *sws_getCoefficients(int colorspace)
     ENDYUV2RGBFUNC()
 
 #define src_type const uint8_t
-#define shift 0
 #define suffix(a) a
 #include "yuv2rgb_template.c"
 #undef src_type
-#undef shift
 #undef suffix
 
 #define src_type const uint16_t
-#define shift 1
-#define suffix(a) a##9
-#include "yuv2rgb_template.c"
-#undef src_type
-#undef shift
-#undef suffix
-
-#define src_type const uint16_t
-#define shift 2
-#define suffix(a) a##10
-#include "yuv2rgb_template.c"
-#undef src_type
-#undef shift
-#undef suffix
-
-#define src_type const uint16_t
-#define shift 8
 #define suffix(a) a##16
 #include "yuv2rgb_template.c"
 #undef src_type
-#undef shift
 #undef suffix
 
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
@@ -216,8 +196,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
            "No accelerated colorspace conversion found from %s to %s.\n",
            av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
-#define SELECT(n) \
-    (bits == 16 ? n##16 : bits == 10 ? n##10 : bits == 9 ? n##9 : n)
+#define SELECT(n) (bits > 8 ? n##16 : n)
 
     switch (c->dstFormat) {
     case AV_PIX_FMT_BGR48BE:
@@ -261,27 +240,29 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     return NULL;
 }
 
-static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
+static void fill_table(uint8_t **table, int bits, const int elemsize,
                        const int64_t inc, void *y_tab)
 {
     int i;
     uint8_t *y_table = y_tab;
+    int count = 1 << bits;
 
-    y_table -= elemsize * (inc >> 9);
+    y_table -= elemsize * (inc >> (17 - bits));
 
-    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
-        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+    for (i = 0; i < count + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, count-1)*inc;
         table[i] = y_table + elemsize * (cb >> 16);
     }
 }
 
-static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
+static void fill_gv_table(int *table, int bits, const int elemsize, const int64_t inc)
 {
     int i;
-    int off    = -(inc >> 9);
+    int off    = -(inc >> (17 - bits));
+    int count = 1 << bits;
 
-    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
-        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
+    for (i = 0; i < count + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, count-1)*inc;
         table[i] = elemsize * (off + (cb >> 16));
     }
 }
@@ -302,6 +283,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
                                      int fullRange, int brightness,
                                      int contrast, int saturation)
 {
+    int bits = av_clip(av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1 + 1, 8, 10);
+    int table_scale = (1 << bits) >> 8;
     const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32     ||
                       c->dstFormat == AV_PIX_FMT_RGB32_1   ||
                       c->dstFormat == AV_PIX_FMT_BGR24     ||
@@ -326,7 +309,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     uint16_t *y_table16;
     uint32_t *y_table32;
     int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
-    const int yoffs = fullRange ? 384 : 326;
+    int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];
     int64_t cbu =  inv_table[1];
@@ -375,117 +358,121 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
     cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
 
+    yoffs *= table_scale;
+    cy    /= table_scale;
+
     av_freep(&c->yuvTable);
+    c->yuvtable_bits = bits;
 
     switch (bpp) {
     case 1:
-        c->yuvTable = av_malloc(1024);
+        c->yuvTable = av_malloc(table_scale * 1024);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 110; i++) {
-            y_table[i + 110]  = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
+        for (i = 0; i < table_scale * (1024 - 110); i++) {
+            y_table[i + table_scale * 110]  = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
             yb               += cy;
         }
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 4:
     case 4 | 128:
         rbase       = isRgb ? 3 : 0;
         gbase       = 1;
         bbase       = isRgb ? 0 : 3;
-        c->yuvTable = av_malloc(1024 * 3);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 110; i++) {
+        for (i = 0; i < table_scale * (1024 - 110); i++) {
             int yval                = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table[i + 110]        = (yval >> 7)        << rbase;
-            y_table[i +  37 + 1024] = ((yval + 43) / 85) << gbase;
-            y_table[i + 110 + 2048] = (yval >> 7)        << bbase;
+            y_table[i + table_scale * 110]        = (yval >> 7)        << rbase;
+            y_table[i + table_scale *  37 + table_scale * 1024] = ((yval + 43) / 85) << gbase;
+            y_table[i + table_scale * 110 + table_scale * 2048] = (yval >> 7)        << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 8:
         rbase       = isRgb ? 5 : 0;
         gbase       = isRgb ? 2 : 3;
         bbase       = isRgb ? 0 : 6;
-        c->yuvTable = av_malloc(1024 * 3);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024 - 38; i++) {
+        for (i = 0; i < table_scale * 1024 - table_scale * 38; i++) {
             int yval               = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table[i + 16]        = ((yval + 18) / 36) << rbase;
-            y_table[i + 16 + 1024] = ((yval + 18) / 36) << gbase;
-            y_table[i + 37 + 2048] = ((yval + 43) / 85) << bbase;
+            y_table[i + table_scale * 16]        = ((yval + 18) / 36) << rbase;
+            y_table[i + table_scale * 16 + table_scale * 1024] = ((yval + 18) / 36) << gbase;
+            y_table[i + table_scale * 37 + table_scale * 2048] = ((yval + 43) / 85) << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 12:
         rbase       = isRgb ? 8 : 0;
         gbase       = 4;
         bbase       = isRgb ? 0 : 8;
-        c->yuvTable = av_malloc(1024 * 3 * 2);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 2);
         y_table16   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             uint8_t yval        = av_clip_uint8((yb + 0x8000) >> 16);
             y_table16[i]        = (yval >> 4) << rbase;
-            y_table16[i + 1024] = (yval >> 4) << gbase;
-            y_table16[i + 2048] = (yval >> 4) << bbase;
+            y_table16[i + table_scale * 1024] = (yval >> 4) << gbase;
+            y_table16[i + table_scale * 2048] = (yval >> 4) << bbase;
             yb += cy;
         }
         if (isNotNe)
-            for (i = 0; i < 1024 * 3; i++)
+            for (i = 0; i < table_scale * 1024 * 3; i++)
                 y_table16[i] = av_bswap16(y_table16[i]);
-        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
-        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
-        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 2, cgv);
+        fill_table(c->table_rV, bits, 2, crv, y_table16 + yoffs);
+        fill_table(c->table_gU, bits, 2, cgu, y_table16 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 2, cbu, y_table16 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 2, cgv);
         break;
     case 15:
     case 16:
         rbase       = isRgb ? bpp - 5 : 0;
         gbase       = 5;
         bbase       = isRgb ? 0 : (bpp - 5);
-        c->yuvTable = av_malloc(1024 * 3 * 2);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 2);
         y_table16   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             uint8_t yval        = av_clip_uint8((yb + 0x8000) >> 16);
             y_table16[i]        = (yval >> 3)          << rbase;
-            y_table16[i + 1024] = (yval >> (18 - bpp)) << gbase;
-            y_table16[i + 2048] = (yval >> 3)          << bbase;
+            y_table16[i + table_scale * 1024] = (yval >> (18 - bpp)) << gbase;
+            y_table16[i + table_scale * 2048] = (yval >> 3)          << bbase;
             yb += cy;
         }
         if (isNotNe)
-            for (i = 0; i < 1024 * 3; i++)
+            for (i = 0; i < table_scale * 1024 * 3; i++)
                 y_table16[i] = av_bswap16(y_table16[i]);
-        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
-        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
-        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 2, cgv);
+        fill_table(c->table_rV, bits, 2, crv, y_table16 + yoffs);
+        fill_table(c->table_gU, bits, 2, cgu, y_table16 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 2, cbu, y_table16 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 2, cgv);
         break;
     case 24:
     case 48:
-        c->yuvTable = av_malloc(1024);
+        c->yuvTable = av_malloc(table_scale * 1024);
         y_table     = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             y_table[i]  = av_clip_uint8((yb + 0x8000) >> 16);
             yb         += cy;
         }
-        fill_table(c->table_rV, 1, crv, y_table + yoffs);
-        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
-        fill_table(c->table_bU, 1, cbu, y_table + yoffs);
-        fill_gv_table(c->table_gV, 1, cgv);
+        fill_table(c->table_rV, bits, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, bits, 1, cgu, y_table + yoffs);
+        fill_table(c->table_bU, bits, 1, cbu, y_table + yoffs);
+        fill_gv_table(c->table_gV, bits, 1, cgv);
         break;
     case 32:
     case 64:
@@ -497,21 +484,21 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
         needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
         if (!needAlpha)
             abase = (base + 24) & 31;
-        c->yuvTable = av_malloc(1024 * 3 * 4);
+        c->yuvTable = av_malloc(table_scale * 1024 * 3 * 4);
         y_table32   = c->yuvTable;
         yb = -(384 << 16) - oy;
-        for (i = 0; i < 1024; i++) {
+        for (i = 0; i < table_scale * 1024; i++) {
             unsigned yval       = av_clip_uint8((yb + 0x8000) >> 16);
             y_table32[i]        = (yval << rbase) +
                                   (needAlpha ? 0 : (255u << abase));
-            y_table32[i + 1024] =  yval << gbase;
-            y_table32[i + 2048] =  yval << bbase;
+            y_table32[i + table_scale * 1024] =  yval << gbase;
+            y_table32[i + table_scale * 2048] =  yval << bbase;
             yb += cy;
         }
-        fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
-        fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
-        fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
-        fill_gv_table(c->table_gV, 4, cgv);
+        fill_table(c->table_rV, bits, 4, crv, y_table32 + yoffs);
+        fill_table(c->table_gU, bits, 4, cgu, y_table32 + yoffs + table_scale * 1024);
+        fill_table(c->table_bU, bits, 4, cbu, y_table32 + yoffs + table_scale * 2048);
+        fill_gv_table(c->table_gV, bits, 4, cgv);
         break;
     default:
         if(!isPlanar(c->dstFormat) || bpp <= 24)
diff --git a/libswscale/yuv2rgb_template.c b/libswscale/yuv2rgb_template.c
index e3ca8ba..0fe9cde 100644
--- a/libswscale/yuv2rgb_template.c
+++ b/libswscale/yuv2rgb_template.c
@@ -248,11 +248,11 @@ YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
     const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
 
 #define PUTRGB16(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + e16[0 + o]] +            \
                      b[Y + f16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + e16[1 + o]] +            \
                      b[Y + f16[1 + o]];
@@ -278,11 +278,11 @@ YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
     const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
 
 #define PUTRGB15(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + d16[1 + o]] +            \
                      b[Y + e16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + d16[0 + o]] +            \
                      b[Y + e16[1 + o]];
@@ -308,11 +308,11 @@ YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
     const uint8_t *d16 = ff_dither_4x4_16[y & 3];
 
 #define PUTRGB12(dst, src, i, o)                    \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d16[0 + o]] +            \
                      g[Y + d16[0 + o]] +            \
                      b[Y + d16[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
                      g[Y + d16[1 + o]] +            \
                      b[Y + d16[1 + o]];
@@ -340,11 +340,11 @@ YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
     const uint8_t *d64 = ff_dither_8x8_73[y & 7];
 
 #define PUTRGB8(dst, src, i, o)                     \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d32[0 + o]] +            \
                      g[Y + d32[0 + o]] +            \
                      b[Y + d64[0 + o]];             \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d32[1 + o]] +            \
                      g[Y + d32[1 + o]] +            \
                      b[Y + d64[1 + o]];
@@ -372,11 +372,11 @@ YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
     int acc;
 
 #define PUTRGB4D(dst, src, i, o)                    \
-    Y      = src[2 * i] >> shift;                   \
+    Y      = src[2 * i];                            \
     acc    = r[Y + d128[0 + o]] +                   \
              g[Y +  d64[0 + o]] +                   \
              b[Y + d128[0 + o]];                    \
-    Y      = src[2 * i + 1] >> shift;               \
+    Y      = src[2 * i + 1];                        \
     acc   |= (r[Y + d128[1 + o]] +                  \
               g[Y +  d64[1 + o]] +                  \
               b[Y + d128[1 + o]]) << 4;             \
@@ -404,11 +404,11 @@ YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
     const uint8_t *d128 = ff_dither_8x8_220[y & 7];
 
 #define PUTRGB4DB(dst, src, i, o)                   \
-    Y              = src[2 * i] >> shift;           \
+    Y              = src[2 * i];                    \
     dst[2 * i]     = r[Y + d128[0 + o]] +           \
                      g[Y +  d64[0 + o]] +           \
                      b[Y + d128[0 + o]];            \
-    Y              = src[2 * i + 1] >> shift;       \
+    Y              = src[2 * i + 1];                \
     dst[2 * i + 1] = r[Y + d128[1 + o]] +           \
                      g[Y +  d64[1 + o]] +           \
                      b[Y + d128[1 + o]];
@@ -436,9 +436,9 @@ YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
     g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB1(out, src, i, o)                     \
-    Y    = src[2 * i] >> shift;                     \
+    Y    = src[2 * i];                              \
     out += out + g[Y + d128[0 + o]];                \
-    Y    = src[2 * i + 1] >> shift;                 \
+    Y    = src[2 * i + 1];                          \
     out += out + g[Y + d128[1 + o]];
 
     PUTRGB1(out_1, py_1, 0, 0);
-- 
1.8.4.2