[FFmpeg-devel] [PATCH] Fix unaligned fill_rectangle in rv34.c

Thu Aug 27 21:57:56 CEST 2009

On Sat, Aug 22, 2009 at 04:11:19PM +0200, Michael Niedermayer wrote:
> let me try again
> if you write a rectangle of 8xC then there exists a semantic partitioning
> of sizeof=8, and things could be aligned accordingly, if not you cannot
> use fill_rectangle()
> you would have to replace r->avail_cache + 5 by + 6 maybe and make other
> related changes, its perfectly fine as well if you dont and remove all
> calls to fill_rectangle() from your code but fill_rectangle() requires
> aligned memory.

So can we get this over with and use the attached patch?
The important cases need something far simpler than fill_rectangle,
so it is not that bad code-wise, I think.
-------------- next part --------------
Index: libavcodec/rv34.c
===================================================================

--- libavcodec/rv34.c	(revision 19710)
+++ libavcodec/rv34.c	(working copy)
@@ -238,6 +238,29 @@
 
 /** @} */ // transform
 
+/**
+ * Fill a square block of uint32_t elements with a given value.
+ * @param dst start of block to fill
+ * @param size width and height of block, must be either 2 or 4 (any value other than 4 is handled as 2)
+ * @param stride distance between the starts of consecutive rows, in uint32_t elements
+ * @param value value to fill with
+ */
+static av_always_inline void fill_uint32_block(uint32_t *dst, int size, int stride, uint32_t value)
+{
+    if (size == 4) {
+        dst[0 * stride + 0] = dst[0 * stride + 1] =
+        dst[0 * stride + 2] = dst[0 * stride + 3] =
+        dst[1 * stride + 0] = dst[1 * stride + 1] =
+        dst[1 * stride + 2] = dst[1 * stride + 3] =
+        dst[2 * stride + 0] = dst[2 * stride + 1] =
+        dst[2 * stride + 2] = dst[2 * stride + 3] =
+        dst[3 * stride + 0] = dst[3 * stride + 1] =
+        dst[3 * stride + 2] = dst[3 * stride + 3] = value;
+    } else {
+        dst[0 * stride + 0] = dst[0 * stride + 1] =
+        dst[1 * stride + 0] = dst[1 * stride + 1] = value;
+    }
+}
 
 /**
  * @defgroup block RV30/40 4x4 block decoding functions
@@ -585,7 +608,7 @@
         }
     }
     if(block_type == RV34_MB_B_BACKWARD || block_type == RV34_MB_B_FORWARD)
-        fill_rectangle(cur_pic->motion_val[!dir][mv_pos], 2, 2, s->b8_stride, 0, 4);
+        fill_uint32_block(cur_pic->motion_val[!dir][mv_pos], 2, s->b8_stride, 0);
 }
 
 /**
@@ -806,11 +829,11 @@
     switch(block_type){
     case RV34_MB_TYPE_INTRA:
     case RV34_MB_TYPE_INTRA16x16:
-        fill_rectangle(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, 2, s->b8_stride, 0, 4);
+        fill_uint32_block(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, s->b8_stride, 0);
         return 0;
     case RV34_MB_SKIP:
         if(s->pict_type == FF_P_TYPE){
-            fill_rectangle(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, 2, s->b8_stride, 0, 4);
+            fill_uint32_block(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, s->b8_stride, 0);
             rv34_mc_1mv (r, block_type, 0, 0, 0, 2, 2, 0);
             break;
         }
@@ -818,8 +841,8 @@
         //surprisingly, it uses motion scheme from next reference frame
         next_bt = s->next_picture_ptr->mb_type[s->mb_x + s->mb_y * s->mb_stride];
         if(IS_INTRA(next_bt) || IS_SKIP(next_bt)){
-            fill_rectangle(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, 2, s->b8_stride, 0, 4);
-            fill_rectangle(s->current_picture_ptr->motion_val[1][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, 2, s->b8_stride, 0, 4);
+            fill_uint32_block(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, s->b8_stride, 0);
+            fill_uint32_block(s->current_picture_ptr->motion_val[1][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, s->b8_stride, 0);
         }else
             for(j = 0; j < 2; j++)
                 for(i = 0; i < 2; i++)
@@ -830,7 +853,7 @@
             rv34_mc_2mv(r, block_type);
         else
             rv34_mc_2mv_skip(r);
-        fill_rectangle(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, 2, s->b8_stride, 0, 4);
+        fill_uint32_block(s->current_picture_ptr->motion_val[0][s->mb_x * 2 + s->mb_y * 2 * s->b8_stride], 2, s->b8_stride, 0);
         break;
     case RV34_MB_P_16x16:
     case RV34_MB_P_MIX16x16:
@@ -987,7 +1010,7 @@
             intra_types += r->intra_types_stride;
         }
         intra_types -= r->intra_types_stride * 4;
-        fill_rectangle(r->avail_cache + 5, 2, 2, 4, 0, 4);
+        fill_uint32_block(r->avail_cache + 5, 2, 4, 0);
         for(j = 0; j < 2; j++){
             idx = 5 + j*4;
             for(i = 0; i < 2; i++, cbp >>= 1, idx++){
@@ -1173,7 +1196,7 @@
 
     // Calculate which neighbours are available. Maybe it's worth optimizing too.
     memset(r->avail_cache, 0, sizeof(r->avail_cache));
-    fill_rectangle(r->avail_cache + 5, 2, 2, 4, 1, 4);
+    fill_uint32_block(r->avail_cache + 5, 2, 4, 1);
     dist = (s->mb_x - s->resync_mb_x) + (s->mb_y - s->resync_mb_y) * s->mb_width;
     if(s->mb_x && dist)
         r->avail_cache[4] =