Browse code

Move dnxhdenc to the execute2 multithreading API. This allows some simplifications, such as removing some outer loops, and gives much better performance when thread_count is greater than the number of idle CPUs.

Originally committed as revision 20211 to svn://svn.ffmpeg.org/ffmpeg/trunk

Reimar Döffinger authored on 2009/10/12 23:43:57
Showing 2 changed files
... ...
@@ -204,6 +204,7 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
204 204
         return -1;
205 205
 
206 206
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail);
207
+    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail);
207 208
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits,    ctx->m.mb_num   *sizeof(uint16_t), fail);
208 209
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale,  ctx->m.mb_num   *sizeof(uint8_t) , fail);
209 210
 
... ...
@@ -211,7 +212,7 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
211 211
     ctx->frame.pict_type = FF_I_TYPE;
212 212
     ctx->m.avctx->coded_frame = &ctx->frame;
213 213
 
214
-    if (avctx->thread_count > MAX_THREADS || (avctx->thread_count > ctx->m.mb_height)) {
214
+    if (avctx->thread_count > MAX_THREADS) {
215 215
         av_log(avctx, AV_LOG_ERROR, "too many threads\n");
216 216
         return -1;
217 217
     }
... ...
@@ -222,11 +223,6 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
222 222
         memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
223 223
     }
224 224
 
225
-    for (i = 0; i < avctx->thread_count; i++) {
226
-        ctx->thread[i]->m.start_mb_y = (ctx->m.mb_height*(i  ) + avctx->thread_count/2) / avctx->thread_count;
227
-        ctx->thread[i]->m.end_mb_y   = (ctx->m.mb_height*(i+1) + avctx->thread_count/2) / avctx->thread_count;
228
-    }
229
-
230 225
     return 0;
231 226
  fail: //for FF_ALLOCZ_OR_GOTO
232 227
     return -1;
... ...
@@ -397,13 +393,13 @@ static av_always_inline int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
397 397
     }
398 398
 }
399 399
 
400
-static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg)
400
+static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
401 401
 {
402
-    DNXHDEncContext *ctx = *(void**)arg;
403
-    int mb_y, mb_x;
404
-    int qscale = ctx->thread[0]->qscale;
402
+    DNXHDEncContext *ctx = avctx->priv_data;
403
+    int mb_y = jobnr, mb_x;
404
+    int qscale = ctx->qscale;
405
+    ctx = ctx->thread[threadnr];
405 406
 
406
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
407 407
         ctx->m.last_dc[0] =
408 408
         ctx->m.last_dc[1] =
409 409
         ctx->m.last_dc[2] = 1024;
... ...
@@ -443,16 +439,16 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg)
443 443
             ctx->mb_rc[qscale][mb].ssd = ssd;
444 444
             ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
445 445
         }
446
-    }
447 446
     return 0;
448 447
 }
449 448
 
450
-static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg)
449
+static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
451 450
 {
452
-    DNXHDEncContext *ctx = *(void**)arg;
453
-    int mb_y, mb_x;
451
+    DNXHDEncContext *ctx = avctx->priv_data;
452
+    int mb_y = jobnr, mb_x;
453
+    ctx = ctx->thread[threadnr];
454
+    init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]);
454 455
 
455
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
456 456
         ctx->m.last_dc[0] =
457 457
         ctx->m.last_dc[1] =
458 458
         ctx->m.last_dc[2] = 1024;
... ...
@@ -477,18 +473,17 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg)
477 477
         }
478 478
         if (put_bits_count(&ctx->m.pb)&31)
479 479
             put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
480
-    }
481 480
     flush_put_bits(&ctx->m.pb);
482 481
     return 0;
483 482
 }
484 483
 
485
-static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx, uint8_t *buf)
484
+static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
486 485
 {
487 486
     int mb_y, mb_x;
488
-    int i, offset = 0;
489
-    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
490
-        int thread_size = 0;
491
-        for (mb_y = ctx->thread[i]->m.start_mb_y; mb_y < ctx->thread[i]->m.end_mb_y; mb_y++) {
487
+    int offset = 0;
488
+    for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
489
+        int thread_size;
490
+        ctx->slice_offs[mb_y] = offset;
492 491
             ctx->slice_size[mb_y] = 0;
493 492
             for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
494 493
                 unsigned mb = mb_y * ctx->m.mb_width + mb_x;
... ...
@@ -496,18 +491,16 @@ static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx, uint8_t *buf)
496 496
             }
497 497
             ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
498 498
             ctx->slice_size[mb_y] >>= 3;
499
-            thread_size += ctx->slice_size[mb_y];
500
-        }
501
-        init_put_bits(&ctx->thread[i]->m.pb, buf + 640 + offset, thread_size);
499
+            thread_size = ctx->slice_size[mb_y];
502 500
         offset += thread_size;
503 501
     }
504 502
 }
505 503
 
506
-static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg)
504
+static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
507 505
 {
508
-    DNXHDEncContext *ctx = *(void**)arg;
509
-    int mb_y, mb_x;
510
-    for (mb_y = ctx->m.start_mb_y; mb_y < ctx->m.end_mb_y; mb_y++) {
506
+    DNXHDEncContext *ctx = avctx->priv_data;
507
+    int mb_y = jobnr, mb_x;
508
+    ctx = ctx->thread[threadnr];
511 509
         for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
512 510
             unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
513 511
             uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
... ...
@@ -516,7 +509,6 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg)
516 516
             ctx->mb_cmp[mb].value = varc;
517 517
             ctx->mb_cmp[mb].mb = mb;
518 518
         }
519
-    }
520 519
     return 0;
521 520
 }
522 521
 
... ...
@@ -528,7 +520,7 @@ static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
528 528
 
529 529
     for (q = 1; q < avctx->qmax; q++) {
530 530
         ctx->qscale = q;
531
-        avctx->execute(avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
531
+        avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
532 532
     }
533 533
     up_step = down_step = 2<<LAMBDA_FRAC_BITS;
534 534
     lambda = ctx->lambda;
... ...
@@ -608,7 +600,7 @@ static int dnxhd_find_qscale(DNXHDEncContext *ctx)
608 608
         bits = 0;
609 609
         ctx->qscale = qscale;
610 610
         // XXX avoid recalculating bits
611
-        ctx->m.avctx->execute(ctx->m.avctx, dnxhd_calc_bits_thread, &ctx->thread[0], NULL, ctx->m.avctx->thread_count, sizeof(void*));
611
+        ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
612 612
         for (y = 0; y < ctx->m.mb_height; y++) {
613 613
             for (x = 0; x < ctx->m.mb_width; x++)
614 614
                 bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
... ...
@@ -732,7 +724,7 @@ static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
732 732
     }
733 733
     if (!ret) {
734 734
         if (RC_VARIANCE)
735
-            avctx->execute(avctx, dnxhd_mb_var_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
735
+            avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height);
736 736
         radix_sort(ctx->mb_cmp, ctx->m.mb_num);
737 737
         for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
738 738
             int mb = ctx->mb_cmp[x].mb;
... ...
@@ -795,7 +787,7 @@ static int dnxhd_encode_picture(AVCodecContext *avctx, unsigned char *buf, int b
795 795
         return -1;
796 796
     }
797 797
 
798
-    dnxhd_setup_threads_slices(ctx, buf);
798
+    dnxhd_setup_threads_slices(ctx);
799 799
 
800 800
     offset = 0;
801 801
     for (i = 0; i < ctx->m.mb_height; i++) {
... ...
@@ -804,7 +796,7 @@ static int dnxhd_encode_picture(AVCodecContext *avctx, unsigned char *buf, int b
804 804
         assert(!(ctx->slice_size[i] & 3));
805 805
     }
806 806
 
807
-    avctx->execute(avctx, dnxhd_encode_thread, &ctx->thread[0], NULL, avctx->thread_count, sizeof(void*));
807
+    avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
808 808
 
809 809
     assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
810 810
     memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);
... ...
@@ -840,6 +832,7 @@ static int dnxhd_encode_end(AVCodecContext *avctx)
840 840
     av_freep(&ctx->mb_rc);
841 841
     av_freep(&ctx->mb_cmp);
842 842
     av_freep(&ctx->slice_size);
843
+    av_freep(&ctx->slice_offs);
843 844
 
844 845
     av_freep(&ctx->qmatrix_c);
845 846
     av_freep(&ctx->qmatrix_l);
... ...
@@ -46,6 +46,7 @@ typedef struct DNXHDEncContext {
46 46
     const CIDEntry *cid_table;
47 47
     uint8_t *msip; ///< Macroblock Scan Indexes Payload
48 48
     uint32_t *slice_size;
49
+    uint32_t *slice_offs;
49 50
 
50 51
     struct DNXHDEncContext *thread[MAX_THREADS];
51 52