Browse code

Optimize C version of ff_emulated_edge_mc().

From ~780 cycles to 551 cycles, mostly just by using libc memcpy()
instead of manually shuffling individual bytes around.

Ronald S. Bultje authored on 2011/01/27 12:37:50
Showing 1 changed file
... ...
@@ -355,38 +355,45 @@ void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int blo
355 355
     start_x= FFMAX(0, -src_x);
356 356
     end_y= FFMIN(block_h, h-src_y);
357 357
     end_x= FFMIN(block_w, w-src_x);
358
+    assert(start_y < end_y && block_h);
359
+    assert(start_x < end_x && block_w);
358 360
 
359
-    // copy existing part
360
-    for(y=start_y; y<end_y; y++){
361
-        for(x=start_x; x<end_x; x++){
362
-            buf[x + y*linesize]= src[x + y*linesize];
363
-        }
364
-    }
361
+    w    = end_x - start_x;
362
+    src += start_y*linesize + start_x;
363
+    buf += start_x;
365 364
 
366 365
     //top
367 366
     for(y=0; y<start_y; y++){
368
-        for(x=start_x; x<end_x; x++){
369
-            buf[x + y*linesize]= buf[x + start_y*linesize];
370
-        }
367
+        memcpy(buf, src, w);
368
+        buf += linesize;
369
+    }
370
+
371
+    // copy existing part
372
+    for(; y<end_y; y++){
373
+        memcpy(buf, src, w);
374
+        src += linesize;
375
+        buf += linesize;
371 376
     }
372 377
 
373 378
     //bottom
374
-    for(y=end_y; y<block_h; y++){
375
-        for(x=start_x; x<end_x; x++){
376
-            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
377
-        }
379
+    src -= linesize;
380
+    for(; y<block_h; y++){
381
+        memcpy(buf, src, w);
382
+        buf += linesize;
378 383
     }
379 384
 
380
-    for(y=0; y<block_h; y++){
385
+    buf -= block_h * linesize + start_x;
386
+    while (block_h--){
381 387
        //left
382 388
         for(x=0; x<start_x; x++){
383
-            buf[x + y*linesize]= buf[start_x + y*linesize];
389
+            buf[x] = buf[start_x];
384 390
         }
385 391
 
386 392
        //right
387 393
         for(x=end_x; x<block_w; x++){
388
-            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
394
+            buf[x] = buf[end_x - 1];
389 395
         }
396
+        buf += linesize;
390 397
     }
391 398
 }
392 399