Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master:
swscale: remove misplaced comment.
ffmpeg: fix streaming to ffserver.
swscale: split out RGB48 output functions from yuv2packed[12X]_c().
build: move vpath directives to main Makefile
swscale: fix JPEG-range YUV scaling artifacts.
build: move ALLFFLIBS to a more logical place
ARM: factor some repetitive code into macros
Fix SVQ3 after adding 4:4:4 H.264 support
H.264: fix CODEC_FLAG_GRAY
4:4:4 H.264 decoding support
ac3enc: fix allocation of floating point samples.

Conflicts:
ffmpeg.c
libavcodec/dsputil_template.c
libavcodec/h264.c
libavcodec/mpegvideo.c
libavcodec/snow.c
libswscale/swscale.c
libswscale/swscale_internal.h

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2011/06/15 09:15:25
Showing 27 changed files
... ...
@@ -2,6 +2,11 @@ include config.mak
2 2
 
3 3
 SRC_DIR = $(SRC_PATH_BARE)
4 4
 
5
+vpath %.c   $(SRC_DIR)
6
+vpath %.h   $(SRC_DIR)
7
+vpath %.S   $(SRC_DIR)
8
+vpath %.asm $(SRC_DIR)
9
+vpath %.v   $(SRC_DIR)
5 10
 vpath %.texi $(SRC_PATH_BARE)
6 11
 
7 12
 PROGS-$(CONFIG_FFMPEG)   += ffmpeg
... ...
@@ -24,6 +29,8 @@ ALLPROGS    = $(BASENAMES:%=%$(EXESUF))
24 24
 ALLPROGS_G  = $(BASENAMES:%=%_g$(EXESUF))
25 25
 ALLMANPAGES = $(BASENAMES:%=%.1)
26 26
 
27
+ALLFFLIBS = avcodec avdevice avfilter avformat avutil postproc swscale
28
+
27 29
 FFLIBS-$(CONFIG_AVDEVICE) += avdevice
28 30
 FFLIBS-$(CONFIG_AVFILTER) += avfilter
29 31
 FFLIBS-$(CONFIG_AVFORMAT) += avformat
... ...
@@ -6,11 +6,6 @@
6 6
 all: all-yes
7 7
 
8 8
 ifndef SUBDIR
9
-vpath %.c   $(SRC_DIR)
10
-vpath %.h   $(SRC_DIR)
11
-vpath %.S   $(SRC_DIR)
12
-vpath %.asm $(SRC_DIR)
13
-vpath %.v   $(SRC_DIR)
14 9
 
15 10
 ifndef V
16 11
 Q      = @
... ...
@@ -25,8 +20,6 @@ $(foreach VAR,$(SILENT),$(eval override $(VAR) = @$($(VAR))))
25 25
 $(eval INSTALL = @$(call ECHO,INSTALL,$$(^:$(SRC_DIR)/%=%)); $(INSTALL))
26 26
 endif
27 27
 
28
-ALLFFLIBS = avcodec avdevice avfilter avformat avutil postproc swscale
29
-
30 28
 IFLAGS   := -I. -I$(SRC_PATH)
31 29
 CPPFLAGS := $(IFLAGS) $(CPPFLAGS)
32 30
 CFLAGS   += $(ECFLAGS)
... ...
@@ -2215,15 +2215,9 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
2215 2215
     AC3EncodeContext *s = avctx->priv_data;
2216 2216
     int channels = s->channels + 1; /* includes coupling channel */
2217 2217
 
2218
-    FF_ALLOC_OR_GOTO(avctx, s->windowed_samples, AC3_WINDOW_SIZE *
2219
-                     sizeof(*s->windowed_samples), alloc_fail);
2220
-    FF_ALLOC_OR_GOTO(avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
2221
-                     alloc_fail);
2222
-    for (ch = 0; ch < s->channels; ch++) {
2223
-        FF_ALLOCZ_OR_GOTO(avctx, s->planar_samples[ch],
2224
-                          (AC3_FRAME_SIZE+AC3_BLOCK_SIZE) * sizeof(**s->planar_samples),
2225
-                          alloc_fail);
2226
-    }
2218
+    if (s->allocate_sample_buffers(s))
2219
+        goto alloc_fail;
2220
+
2227 2221
     FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * channels *
2228 2222
                      AC3_MAX_COEFS * sizeof(*s->bap_buffer),  alloc_fail);
2229 2223
     FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * channels *
... ...
@@ -2323,6 +2317,8 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
2323 2323
     AC3EncodeContext *s = avctx->priv_data;
2324 2324
     int ret, frame_size_58;
2325 2325
 
2326
+    s->avctx = avctx;
2327
+
2326 2328
     s->eac3 = avctx->codec_id == CODEC_ID_EAC3;
2327 2329
 
2328 2330
     avctx->frame_size = AC3_FRAME_SIZE;
... ...
@@ -2355,6 +2351,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
2355 2355
         s->apply_window                 = ff_ac3_fixed_apply_window;
2356 2356
         s->normalize_samples            = ff_ac3_fixed_normalize_samples;
2357 2357
         s->scale_coefficients           = ff_ac3_fixed_scale_coefficients;
2358
+        s->allocate_sample_buffers      = ff_ac3_fixed_allocate_sample_buffers;
2358 2359
         s->deinterleave_input_samples   = ff_ac3_fixed_deinterleave_input_samples;
2359 2360
         s->apply_mdct                   = ff_ac3_fixed_apply_mdct;
2360 2361
         s->apply_channel_coupling       = ff_ac3_fixed_apply_channel_coupling;
... ...
@@ -2364,6 +2361,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
2364 2364
         s->mdct_init                    = ff_ac3_float_mdct_init;
2365 2365
         s->apply_window                 = ff_ac3_float_apply_window;
2366 2366
         s->scale_coefficients           = ff_ac3_float_scale_coefficients;
2367
+        s->allocate_sample_buffers      = ff_ac3_float_allocate_sample_buffers;
2367 2368
         s->deinterleave_input_samples   = ff_ac3_float_deinterleave_input_samples;
2368 2369
         s->apply_mdct                   = ff_ac3_float_apply_mdct;
2369 2370
         s->apply_channel_coupling       = ff_ac3_float_apply_channel_coupling;
... ...
@@ -135,6 +135,7 @@ typedef struct AC3Block {
135 135
 typedef struct AC3EncodeContext {
136 136
     AVClass *av_class;                      ///< AVClass used for AVOption
137 137
     AC3EncOptions options;                  ///< encoding options
138
+    AVCodecContext *avctx;                  ///< parent AVCodecContext
138 139
     PutBitContext pb;                       ///< bitstream writer context
139 140
     DSPContext dsp;
140 141
     AC3DSPContext ac3dsp;                   ///< AC-3 optimized functions
... ...
@@ -230,6 +231,7 @@ typedef struct AC3EncodeContext {
230 230
     void (*scale_coefficients)(struct AC3EncodeContext *s);
231 231
 
232 232
     /* fixed vs. float templated function pointers */
233
+    int  (*allocate_sample_buffers)(struct AC3EncodeContext *s);
233 234
     void (*deinterleave_input_samples)(struct AC3EncodeContext *s,
234 235
                                        const SampleType *samples);
235 236
     void (*apply_mdct)(struct AC3EncodeContext *s);
... ...
@@ -276,6 +278,9 @@ void ff_ac3_float_scale_coefficients(AC3EncodeContext *s);
276 276
 
277 277
 /* prototypes for functions in ac3enc_template.c */
278 278
 
279
+int ff_ac3_fixed_allocate_sample_buffers(AC3EncodeContext *s);
280
+int ff_ac3_float_allocate_sample_buffers(AC3EncodeContext *s);
281
+
279 282
 void ff_ac3_fixed_deinterleave_input_samples(AC3EncodeContext *s,
280 283
                                              const SampleType *samples);
281 284
 void ff_ac3_float_deinterleave_input_samples(AC3EncodeContext *s,
... ...
@@ -31,6 +31,26 @@
31 31
 #include "ac3enc.h"
32 32
 
33 33
 
34
+int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
35
+{
36
+    int ch;
37
+
38
+    FF_ALLOC_OR_GOTO(s->avctx, s->windowed_samples, AC3_WINDOW_SIZE *
39
+                     sizeof(*s->windowed_samples), alloc_fail);
40
+    FF_ALLOC_OR_GOTO(s->avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
41
+                     alloc_fail);
42
+    for (ch = 0; ch < s->channels; ch++) {
43
+        FF_ALLOCZ_OR_GOTO(s->avctx, s->planar_samples[ch],
44
+                          (AC3_FRAME_SIZE+AC3_BLOCK_SIZE) * sizeof(**s->planar_samples),
45
+                          alloc_fail);
46
+    }
47
+
48
+    return 0;
49
+alloc_fail:
50
+    return AVERROR(ENOMEM);
51
+}
52
+
53
+
34 54
 /**
35 55
  * Deinterleave input samples.
36 56
  * Channels are reordered from Libav's default order to AC-3 order.
... ...
@@ -122,7 +122,8 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth)
122 122
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
123 123
     c->h264_idct_add16      = ff_h264_idct_add16_neon;
124 124
     c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
125
-    c->h264_idct_add8       = ff_h264_idct_add8_neon;
125
+    //FIXME: reenable when asm is updated.
126
+    //c->h264_idct_add8       = ff_h264_idct_add8_neon;
126 127
     c->h264_idct8_add       = ff_h264_idct8_add_neon;
127 128
     c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
128 129
     c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
... ...
@@ -35,6 +35,21 @@
35 35
  *
36 36
  * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
37 37
  */
38
+
39
+.macro  dequant_t       dst, src, mul, add, tmp
40
+        rsbs            \tmp, ip, \src, asr #16
41
+        addgt           \tmp, \add, #0
42
+        rsblt           \tmp, \add, #0
43
+        smlatbne        \dst, \src, \mul, \tmp
44
+.endm
45
+
46
+.macro  dequant_b       dst, src, mul, add, tmp
47
+        rsbs            \tmp, ip, \src, lsl #16
48
+        addgt           \tmp, \add, #0
49
+        rsblt           \tmp, \add, #0
50
+        smlabbne        \dst, \src, \mul, \tmp
51
+.endm
52
+
38 53
 function ff_dct_unquantize_h263_armv5te, export=1
39 54
         push            {r4-r9,lr}
40 55
         mov             ip, #0
... ...
@@ -44,50 +59,20 @@ function ff_dct_unquantize_h263_armv5te, export=1
44 44
 1:
45 45
         ldrd            r6, [r0, #8]
46 46
 
47
-        rsbs            r9, ip, r4, asr #16
48
-        addgt           r9, r2, #0
49
-        rsblt           r9, r2, #0
50
-        smlatbne        r9, r4, r1, r9
51
-
52
-        rsbs            lr, ip, r5, asr #16
53
-        addgt           lr, r2, #0
54
-        rsblt           lr, r2, #0
55
-        smlatbne        lr, r5, r1, lr
56
-
57
-        rsbs            r8, ip, r4, asl #16
58
-        addgt           r8, r2, #0
59
-        rsblt           r8, r2, #0
60
-        smlabbne        r4, r4, r1, r8
61
-
62
-        rsbs            r8, ip, r5, asl #16
63
-        addgt           r8, r2, #0
64
-        rsblt           r8, r2, #0
65
-        smlabbne        r5, r5, r1, r8
47
+        dequant_t       r9, r4, r1, r2, r9
48
+        dequant_t       lr, r5, r1, r2, lr
49
+        dequant_b       r4, r4, r1, r2, r8
50
+        dequant_b       r5, r5, r1, r2, r8
66 51
 
67 52
         strh            r4, [r0], #2
68 53
         strh            r9, [r0], #2
69 54
         strh            r5, [r0], #2
70 55
         strh            lr, [r0], #2
71 56
 
72
-        rsbs            r9, ip, r6, asr #16
73
-        addgt           r9, r2, #0
74
-        rsblt           r9, r2, #0
75
-        smlatbne        r9, r6, r1, r9
76
-
77
-        rsbs            lr, ip, r7, asr #16
78
-        addgt           lr, r2, #0
79
-        rsblt           lr, r2, #0
80
-        smlatbne        lr, r7, r1, lr
81
-
82
-        rsbs            r8, ip, r6, asl #16
83
-        addgt           r8, r2, #0
84
-        rsblt           r8, r2, #0
85
-        smlabbne        r6, r6, r1, r8
86
-
87
-        rsbs            r8, ip, r7, asl #16
88
-        addgt           r8, r2, #0
89
-        rsblt           r8, r2, #0
90
-        smlabbne        r7, r7, r1, r8
57
+        dequant_t       r9, r6, r1, r2, r9
58
+        dequant_t       lr, r7, r1, r2, lr
59
+        dequant_b       r6, r6, r1, r2, r8
60
+        dequant_b       r7, r7, r1, r2, r8
91 61
 
92 62
         strh            r6, [r0], #2
93 63
         strh            r9, [r0], #2
... ...
@@ -333,6 +333,20 @@ function idct_col_armv5te
333 333
         ldr    pc, [sp], #4
334 334
 endfunc
335 335
 
336
+.macro  clip   dst, src:vararg
337
+        movs   \dst, \src
338
+        movmi  \dst, #0
339
+        cmp    \dst, #255
340
+        movgt  \dst, #255
341
+.endm
342
+
343
+.macro  aclip  dst, src:vararg
344
+        adds   \dst, \src
345
+        movmi  \dst, #0
346
+        cmp    \dst, #255
347
+        movgt  \dst, #255
348
+.endm
349
+
336 350
 function idct_col_put_armv5te
337 351
         str    lr, [sp, #-4]!
338 352
 
... ...
@@ -341,27 +355,15 @@ function idct_col_put_armv5te
341 341
         ldmfd  sp!, {a3, a4}
342 342
         ldr    lr, [sp, #32]
343 343
         add    a2, a3, v1
344
-        movs   a2, a2, asr #20
345
-        movmi  a2, #0
346
-        cmp    a2, #255
347
-        movgt  a2, #255
344
+        clip   a2, a2, asr #20
348 345
         add    ip, a4, v2
349
-        movs   ip, ip, asr #20
350
-        movmi  ip, #0
351
-        cmp    ip, #255
352
-        movgt  ip, #255
346
+        clip   ip, ip, asr #20
353 347
         orr    a2, a2, ip, lsl #8
354 348
         sub    a3, a3, v1
355
-        movs   a3, a3, asr #20
356
-        movmi  a3, #0
357
-        cmp    a3, #255
358
-        movgt  a3, #255
349
+        clip   a3, a3, asr #20
359 350
         sub    a4, a4, v2
360
-        movs   a4, a4, asr #20
361
-        movmi  a4, #0
362
-        cmp    a4, #255
351
+        clip   a4, a4, asr #20
363 352
         ldr    v1, [sp, #28]
364
-        movgt  a4, #255
365 353
         strh   a2, [v1]
366 354
         add    a2, v1, #2
367 355
         str    a2, [sp, #28]
... ...
@@ -371,79 +373,43 @@ function idct_col_put_armv5te
371 371
         strh   a2, [v2, v1]!
372 372
 
373 373
         sub    a2, a3, v3
374
-        movs   a2, a2, asr #20
375
-        movmi  a2, #0
376
-        cmp    a2, #255
377
-        movgt  a2, #255
374
+        clip   a2, a2, asr #20
378 375
         sub    ip, a4, v4
379
-        movs   ip, ip, asr #20
380
-        movmi  ip, #0
381
-        cmp    ip, #255
382
-        movgt  ip, #255
376
+        clip   ip, ip, asr #20
383 377
         orr    a2, a2, ip, lsl #8
384 378
         strh   a2, [v1, lr]!
385 379
         add    a3, a3, v3
386
-        movs   a2, a3, asr #20
387
-        movmi  a2, #0
388
-        cmp    a2, #255
389
-        movgt  a2, #255
380
+        clip   a2, a3, asr #20
390 381
         add    a4, a4, v4
391
-        movs   a4, a4, asr #20
392
-        movmi  a4, #0
393
-        cmp    a4, #255
394
-        movgt  a4, #255
382
+        clip   a4, a4, asr #20
395 383
         orr    a2, a2, a4, lsl #8
396 384
         ldmfd  sp!, {a3, a4}
397 385
         strh   a2, [v2, -lr]!
398 386
 
399 387
         add    a2, a3, v5
400
-        movs   a2, a2, asr #20
401
-        movmi  a2, #0
402
-        cmp    a2, #255
403
-        movgt  a2, #255
388
+        clip   a2, a2, asr #20
404 389
         add    ip, a4, v6
405
-        movs   ip, ip, asr #20
406
-        movmi  ip, #0
407
-        cmp    ip, #255
408
-        movgt  ip, #255
390
+        clip   ip, ip, asr #20
409 391
         orr    a2, a2, ip, lsl #8
410 392
         strh   a2, [v1, lr]!
411 393
         sub    a3, a3, v5
412
-        movs   a2, a3, asr #20
413
-        movmi  a2, #0
414
-        cmp    a2, #255
415
-        movgt  a2, #255
394
+        clip   a2, a3, asr #20
416 395
         sub    a4, a4, v6
417
-        movs   a4, a4, asr #20
418
-        movmi  a4, #0
419
-        cmp    a4, #255
420
-        movgt  a4, #255
396
+        clip   a4, a4, asr #20
421 397
         orr    a2, a2, a4, lsl #8
422 398
         ldmfd  sp!, {a3, a4}
423 399
         strh   a2, [v2, -lr]!
424 400
 
425 401
         add    a2, a3, v7
426
-        movs   a2, a2, asr #20
427
-        movmi  a2, #0
428
-        cmp    a2, #255
429
-        movgt  a2, #255
402
+        clip   a2, a2, asr #20
430 403
         add    ip, a4, fp
431
-        movs   ip, ip, asr #20
432
-        movmi  ip, #0
433
-        cmp    ip, #255
434
-        movgt  ip, #255
404
+        clip   ip, ip, asr #20
435 405
         orr    a2, a2, ip, lsl #8
436 406
         strh   a2, [v1, lr]
437 407
         sub    a3, a3, v7
438
-        movs   a2, a3, asr #20
439
-        movmi  a2, #0
440
-        cmp    a2, #255
441
-        movgt  a2, #255
408
+        clip   a2, a3, asr #20
442 409
         sub    a4, a4, fp
443
-        movs   a4, a4, asr #20
444
-        movmi  a4, #0
445
-        cmp    a4, #255
446
-        movgt  a4, #255
410
+        clip   a4, a4, asr #20
447 411
         orr    a2, a2, a4, lsl #8
448 412
         strh   a2, [v2, -lr]
449 413
 
... ...
@@ -460,36 +426,22 @@ function idct_col_add_armv5te
460 460
         ldmfd  sp!, {a3, a4}
461 461
         ldrh   ip, [lr]
462 462
         add    a2, a3, v1
463
-        mov    a2, a2, asr #20
464 463
         sub    a3, a3, v1
465 464
         and    v1, ip, #255
466
-        adds   a2, a2, v1
467
-        movmi  a2, #0
468
-        cmp    a2, #255
469
-        movgt  a2, #255
465
+        aclip  a2, v1, a2, asr #20
470 466
         add    v1, a4, v2
471 467
         mov    v1, v1, asr #20
472
-        adds   v1, v1, ip, lsr #8
473
-        movmi  v1, #0
474
-        cmp    v1, #255
475
-        movgt  v1, #255
468
+        aclip  v1, v1, ip, lsr #8
476 469
         orr    a2, a2, v1, lsl #8
477 470
         ldr    v1, [sp, #32]
478 471
         sub    a4, a4, v2
479 472
         rsb    v2, v1, v1, lsl #3
480 473
         ldrh   ip, [v2, lr]!
481 474
         strh   a2, [lr]
482
-        mov    a3, a3, asr #20
483 475
         and    a2, ip, #255
484
-        adds   a3, a3, a2
485
-        movmi  a3, #0
486
-        cmp    a3, #255
487
-        movgt  a3, #255
476
+        aclip  a3, a2, a3, asr #20
488 477
         mov    a4, a4, asr #20
489
-        adds   a4, a4, ip, lsr #8
490
-        movmi  a4, #0
491
-        cmp    a4, #255
492
-        movgt  a4, #255
478
+        aclip  a4, a4, ip, lsr #8
493 479
         add    a2, lr, #2
494 480
         str    a2, [sp, #28]
495 481
         orr    a2, a3, a4, lsl #8
... ...
@@ -498,102 +450,60 @@ function idct_col_add_armv5te
498 498
         ldmfd  sp!, {a3, a4}
499 499
         ldrh   ip, [lr, v1]!
500 500
         sub    a2, a3, v3
501
-        mov    a2, a2, asr #20
502 501
         add    a3, a3, v3
503 502
         and    v3, ip, #255
504
-        adds   a2, a2, v3
505
-        movmi  a2, #0
506
-        cmp    a2, #255
507
-        movgt  a2, #255
503
+        aclip  a2, v3, a2, asr #20
508 504
         sub    v3, a4, v4
509 505
         mov    v3, v3, asr #20
510
-        adds   v3, v3, ip, lsr #8
511
-        movmi  v3, #0
512
-        cmp    v3, #255
513
-        movgt  v3, #255
506
+        aclip  v3, v3, ip, lsr #8
514 507
         orr    a2, a2, v3, lsl #8
515 508
         add    a4, a4, v4
516 509
         ldrh   ip, [v2, -v1]!
517 510
         strh   a2, [lr]
518
-        mov    a3, a3, asr #20
519 511
         and    a2, ip, #255
520
-        adds   a3, a3, a2
521
-        movmi  a3, #0
522
-        cmp    a3, #255
523
-        movgt  a3, #255
512
+        aclip  a3, a2, a3, asr #20
524 513
         mov    a4, a4, asr #20
525
-        adds   a4, a4, ip, lsr #8
526
-        movmi  a4, #0
527
-        cmp    a4, #255
528
-        movgt  a4, #255
514
+        aclip  a4, a4, ip, lsr #8
529 515
         orr    a2, a3, a4, lsl #8
530 516
         strh   a2, [v2]
531 517
 
532 518
         ldmfd  sp!, {a3, a4}
533 519
         ldrh   ip, [lr, v1]!
534 520
         add    a2, a3, v5
535
-        mov    a2, a2, asr #20
536 521
         sub    a3, a3, v5
537 522
         and    v3, ip, #255
538
-        adds   a2, a2, v3
539
-        movmi  a2, #0
540
-        cmp    a2, #255
541
-        movgt  a2, #255
523
+        aclip  a2, v3, a2, asr #20
542 524
         add    v3, a4, v6
543 525
         mov    v3, v3, asr #20
544
-        adds   v3, v3, ip, lsr #8
545
-        movmi  v3, #0
546
-        cmp    v3, #255
547
-        movgt  v3, #255
526
+        aclip  v3, v3, ip, lsr #8
548 527
         orr    a2, a2, v3, lsl #8
549 528
         sub    a4, a4, v6
550 529
         ldrh   ip, [v2, -v1]!
551 530
         strh   a2, [lr]
552
-        mov    a3, a3, asr #20
553 531
         and    a2, ip, #255
554
-        adds   a3, a3, a2
555
-        movmi  a3, #0
556
-        cmp    a3, #255
557
-        movgt  a3, #255
532
+        aclip  a3, a2, a3, asr #20
558 533
         mov    a4, a4, asr #20
559
-        adds   a4, a4, ip, lsr #8
560
-        movmi  a4, #0
561
-        cmp    a4, #255
562
-        movgt  a4, #255
534
+        aclip  a4, a4, ip, lsr #8
563 535
         orr    a2, a3, a4, lsl #8
564 536
         strh   a2, [v2]
565 537
 
566 538
         ldmfd  sp!, {a3, a4}
567 539
         ldrh   ip, [lr, v1]!
568 540
         add    a2, a3, v7
569
-        mov    a2, a2, asr #20
570 541
         sub    a3, a3, v7
571 542
         and    v3, ip, #255
572
-        adds   a2, a2, v3
573
-        movmi  a2, #0
574
-        cmp    a2, #255
575
-        movgt  a2, #255
543
+        aclip  a2, v3, a2, asr #20
576 544
         add    v3, a4, fp
577 545
         mov    v3, v3, asr #20
578
-        adds   v3, v3, ip, lsr #8
579
-        movmi  v3, #0
580
-        cmp    v3, #255
581
-        movgt  v3, #255
546
+        aclip  v3, v3, ip, lsr #8
582 547
         orr    a2, a2, v3, lsl #8
583 548
         sub    a4, a4, fp
584 549
         ldrh   ip, [v2, -v1]!
585 550
         strh   a2, [lr]
586
-        mov    a3, a3, asr #20
587 551
         and    a2, ip, #255
588
-        adds   a3, a3, a2
589
-        movmi  a3, #0
590
-        cmp    a3, #255
591
-        movgt  a3, #255
552
+        aclip  a3, a2, a3, asr #20
592 553
         mov    a4, a4, asr #20
593
-        adds   a4, a4, ip, lsr #8
594
-        movmi  a4, #0
595
-        cmp    a4, #255
596
-        movgt  a4, #255
554
+        aclip  a4, a4, ip, lsr #8
597 555
         orr    a2, a3, a4, lsl #8
598 556
         strh   a2, [v2]
599 557
 
... ...
@@ -505,7 +505,7 @@ typedef struct DSPContext {
505 505
 #define BASIS_SHIFT 16
506 506
 #define RECON_SHIFT 6
507 507
 
508
-    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int sides);
508
+    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);
509 509
 #define EDGE_WIDTH 16
510 510
 #define EDGE_TOP    1
511 511
 #define EDGE_BOTTOM 2
... ...
@@ -79,7 +79,7 @@ static inline void FUNC(copy_block16)(uint8_t *dst, const uint8_t *src, int dstS
79 79
 
80 80
 /* draw the edges of width 'w' and height 'h' of an image of size width, height */
81 81
 //FIXME check that this is ok for mpeg4 interlaced
82
-static void FUNCC(draw_edges)(uint8_t *p_buf, int p_wrap, int width, int height, int w, int sides)
82
+static void FUNCC(draw_edges)(uint8_t *p_buf, int p_wrap, int width, int height, int w, int h, int sides)
83 83
 {
84 84
     pixel *buf = (pixel*)p_buf;
85 85
     int wrap = p_wrap / sizeof(pixel);
... ...
@@ -106,10 +106,10 @@ static void FUNCC(draw_edges)(uint8_t *p_buf, int p_wrap, int width, int height,
106 106
     buf -= w;
107 107
     last_line = buf + (height - 1) * wrap;
108 108
     if (sides & EDGE_TOP)
109
-        for(i = 0; i < w; i++)
109
+        for(i = 0; i < h; i++)
110 110
             memcpy(buf - (i + 1) * wrap, buf, (width + w + w) * sizeof(pixel)); // top
111 111
     if (sides & EDGE_BOTTOM)
112
-        for (i = 0; i < w; i++)
112
+        for (i = 0; i < h; i++)
113 113
             memcpy(last_line + (i + 1) * wrap, last_line, (width + w + w) * sizeof(pixel)); // bottom
114 114
 }
115 115
 
... ...
@@ -451,12 +451,13 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
451 451
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
452 452
                            int src_x_offset, int src_y_offset,
453 453
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
454
-                           int pixel_shift){
454
+                           int pixel_shift, int chroma444){
455 455
     MpegEncContext * const s = &h->s;
456 456
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
457 457
     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
458 458
     const int luma_xy= (mx&3) + ((my&3)<<2);
459
-    uint8_t * src_y = pic->data[0] + ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
459
+    int offset = ((mx>>2) << pixel_shift) + (my>>2)*h->mb_linesize;
460
+    uint8_t * src_y = pic->data[0] + offset;
460 461
     uint8_t * src_cb, * src_cr;
461 462
     int extra_width= h->emu_edge_width;
462 463
     int extra_height= h->emu_edge_height;
... ...
@@ -485,6 +486,31 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
485 485
 
486 486
     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
487 487
 
488
+    if(chroma444){
489
+        src_cb = pic->data[1] + offset;
490
+        if(emu){
491
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
492
+                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
493
+            src_cb= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
494
+        }
495
+        qpix_op[luma_xy](dest_cb, src_cb, h->mb_linesize); //FIXME try variable height perhaps?
496
+        if(!square){
497
+            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, h->mb_linesize);
498
+        }
499
+
500
+        src_cr = pic->data[2] + offset;
501
+        if(emu){
502
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
503
+                                    16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
504
+            src_cr= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
505
+        }
506
+        qpix_op[luma_xy](dest_cr, src_cr, h->mb_linesize); //FIXME try variable height perhaps?
507
+        if(!square){
508
+            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, h->mb_linesize);
509
+        }
510
+        return;
511
+    }
512
+
488 513
     if(MB_FIELD){
489 514
         // chroma offset when predicting from a field of opposite parity
490 515
         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
... ...
@@ -511,14 +537,19 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
511 511
                            int x_offset, int y_offset,
512 512
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
513 513
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
514
-                           int list0, int list1, int pixel_shift){
514
+                           int list0, int list1, int pixel_shift, int chroma444){
515 515
     MpegEncContext * const s = &h->s;
516 516
     qpel_mc_func *qpix_op=  qpix_put;
517 517
     h264_chroma_mc_func chroma_op= chroma_put;
518 518
 
519
-    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
520
-    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
521
-    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
519
+    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
520
+    if(chroma444){
521
+        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
522
+        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
523
+    }else{
524
+        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
525
+        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
526
+    }
522 527
     x_offset += 8*s->mb_x;
523 528
     y_offset += 8*(s->mb_y >> MB_FIELD);
524 529
 
... ...
@@ -526,7 +557,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
526 526
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
527 527
         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
528 528
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
529
-                           qpix_op, chroma_op, pixel_shift);
529
+                           qpix_op, chroma_op, pixel_shift, chroma444);
530 530
 
531 531
         qpix_op=  qpix_avg;
532 532
         chroma_op= chroma_avg;
... ...
@@ -536,7 +567,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
536 536
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
537 537
         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
538 538
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
539
-                           qpix_op, chroma_op, pixel_shift);
539
+                           qpix_op, chroma_op, pixel_shift, chroma444);
540 540
     }
541 541
 }
542 542
 
... ...
@@ -546,12 +577,19 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
546 546
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
547 547
                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
548 548
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
549
-                           int list0, int list1, int pixel_shift){
549
+                           int list0, int list1, int pixel_shift, int chroma444){
550 550
     MpegEncContext * const s = &h->s;
551 551
 
552
-    dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->  mb_linesize;
553
-    dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
554
-    dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
552
+    dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
553
+    if(chroma444){
554
+        chroma_weight_avg = luma_weight_avg;
555
+        chroma_weight_op = luma_weight_op;
556
+        dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
557
+        dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
558
+    }else{
559
+        dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
560
+        dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
561
+    }
555 562
     x_offset += 8*s->mb_x;
556 563
     y_offset += 8*(s->mb_y >> MB_FIELD);
557 564
 
... ...
@@ -559,17 +597,17 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
559 559
         /* don't optimize for luma-only case, since B-frames usually
560 560
          * use implicit weights => chroma too. */
561 561
         uint8_t *tmp_cb = s->obmc_scratchpad;
562
-        uint8_t *tmp_cr = s->obmc_scratchpad + (8 << pixel_shift);
563
-        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
562
+        uint8_t *tmp_cr = s->obmc_scratchpad + (16 << pixel_shift);
563
+        uint8_t *tmp_y  = s->obmc_scratchpad + 16*h->mb_uvlinesize;
564 564
         int refn0 = h->ref_cache[0][ scan8[n] ];
565 565
         int refn1 = h->ref_cache[1][ scan8[n] ];
566 566
 
567 567
         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
568 568
                     dest_y, dest_cb, dest_cr,
569
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
569
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
570 570
         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
571 571
                     tmp_y, tmp_cb, tmp_cr,
572
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift);
572
+                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
573 573
 
574 574
         if(h->use_weight == 2){
575 575
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
... ...
@@ -594,7 +632,7 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
594 594
         Picture *ref= &h->ref_list[list][refn];
595 595
         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
596 596
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
597
-                    qpix_put, chroma_put, pixel_shift);
597
+                    qpix_put, chroma_put, pixel_shift, chroma444);
598 598
 
599 599
         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
600 600
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
... ...
@@ -613,21 +651,21 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
613 613
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
614 614
                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
615 615
                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
616
-                           int list0, int list1, int pixel_shift){
616
+                           int list0, int list1, int pixel_shift, int chroma444){
617 617
     if((h->use_weight==2 && list0 && list1
618 618
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
619 619
        || h->use_weight==1)
620 620
         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
621 621
                          x_offset, y_offset, qpix_put, chroma_put,
622 622
                          weight_op[0], weight_op[3], weight_avg[0],
623
-                         weight_avg[3], list0, list1, pixel_shift);
623
+                         weight_avg[3], list0, list1, pixel_shift, chroma444);
624 624
     else
625 625
         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
626 626
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
627
-                    chroma_avg, list0, list1, pixel_shift);
627
+                    chroma_avg, list0, list1, pixel_shift, chroma444);
628 628
 }
629 629
 
630
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift){
630
+static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
631 631
     /* fetch pixels for estimated mv 4 macroblocks ahead
632 632
      * optimized for 64byte cache lines */
633 633
     MpegEncContext * const s = &h->s;
... ...
@@ -638,8 +676,13 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift){
638 638
         uint8_t **src= h->ref_list[list][refn].data;
639 639
         int off= ((mx+64)<<h->pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize;
640 640
         s->dsp.prefetch(src[0]+off, s->linesize, 4);
641
-        off= (((mx>>1)+64)<<h->pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize;
642
-        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
641
+        if(chroma444){
642
+            s->dsp.prefetch(src[1]+off, s->linesize, 4);
643
+            s->dsp.prefetch(src[2]+off, s->linesize, 4);
644
+        }else{
645
+            off= (((mx>>1)+64)<<pixel_shift) + ((my>>1) + (s->mb_x&7))*s->uvlinesize;
646
+            s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
647
+        }
643 648
     }
644 649
 }
645 650
 
... ...
@@ -647,7 +690,7 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
647 647
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
648 648
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
649 649
                       h264_weight_func *weight_op, h264_biweight_func *weight_avg,
650
-                      int pixel_shift){
650
+                      int pixel_shift, int chroma444){
651 651
     MpegEncContext * const s = &h->s;
652 652
     const int mb_xy= h->mb_xy;
653 653
     const int mb_type= s->current_picture.mb_type[mb_xy];
... ...
@@ -656,36 +699,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
656 656
 
657 657
     if(HAVE_PTHREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
658 658
         await_references(h);
659
-    prefetch_motion(h, 0, pixel_shift);
659
+    prefetch_motion(h, 0, pixel_shift, chroma444);
660 660
 
661 661
     if(IS_16X16(mb_type)){
662 662
         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
663 663
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
664 664
                 weight_op, weight_avg,
665 665
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
666
-                pixel_shift);
666
+                pixel_shift, chroma444);
667 667
     }else if(IS_16X8(mb_type)){
668 668
         mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
669 669
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
670 670
                 &weight_op[1], &weight_avg[1],
671 671
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
672
-                pixel_shift);
672
+                pixel_shift, chroma444);
673 673
         mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
674 674
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
675 675
                 &weight_op[1], &weight_avg[1],
676 676
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
677
-                pixel_shift);
677
+                pixel_shift, chroma444);
678 678
     }else if(IS_8X16(mb_type)){
679 679
         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
680 680
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
681 681
                 &weight_op[2], &weight_avg[2],
682 682
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
683
-                pixel_shift);
683
+                pixel_shift, chroma444);
684 684
         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
685 685
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
686 686
                 &weight_op[2], &weight_avg[2],
687 687
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
688
-                pixel_shift);
688
+                pixel_shift, chroma444);
689 689
     }else{
690 690
         int i;
691 691
 
... ...
@@ -702,29 +745,29 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
702 702
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
703 703
                     &weight_op[3], &weight_avg[3],
704 704
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
705
-                    pixel_shift);
705
+                    pixel_shift, chroma444);
706 706
             }else if(IS_SUB_8X4(sub_mb_type)){
707 707
                 mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
708 708
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
709 709
                     &weight_op[4], &weight_avg[4],
710 710
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
711
-                    pixel_shift);
711
+                    pixel_shift, chroma444);
712 712
                 mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
713 713
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
714 714
                     &weight_op[4], &weight_avg[4],
715 715
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
716
-                    pixel_shift);
716
+                    pixel_shift, chroma444);
717 717
             }else if(IS_SUB_4X8(sub_mb_type)){
718 718
                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
719 719
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
720 720
                     &weight_op[5], &weight_avg[5],
721 721
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
722
-                    pixel_shift);
722
+                    pixel_shift, chroma444);
723 723
                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
724 724
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
725 725
                     &weight_op[5], &weight_avg[5],
726 726
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
727
-                    pixel_shift);
727
+                    pixel_shift, chroma444);
728 728
             }else{
729 729
                 int j;
730 730
                 assert(IS_SUB_4X4(sub_mb_type));
... ...
@@ -735,13 +778,13 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
735 735
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
736 736
                         &weight_op[6], &weight_avg[6],
737 737
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
738
-                        pixel_shift);
738
+                        pixel_shift, chroma444);
739 739
                 }
740 740
             }
741 741
         }
742 742
     }
743 743
 
744
-    prefetch_motion(h, 1, pixel_shift);
744
+    prefetch_motion(h, 1, pixel_shift, chroma444);
745 745
 }
746 746
 
747 747
 #define hl_motion_fn(sh, bits) \
... ...
@@ -753,10 +796,11 @@ static av_always_inline void hl_motion_ ## bits(H264Context *h, \
753 753
                                        qpel_mc_func (*qpix_avg)[16], \
754 754
                                        h264_chroma_mc_func (*chroma_avg), \
755 755
                                        h264_weight_func *weight_op, \
756
-                                       h264_biweight_func *weight_avg) \
756
+                                       h264_biweight_func *weight_avg, \
757
+                                       int chroma444) \
757 758
 { \
758 759
     hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, \
759
-              qpix_avg, chroma_avg, weight_op, weight_avg, sh); \
760
+              qpix_avg, chroma_avg, weight_op, weight_avg, sh, chroma444); \
760 761
 }
761 762
 hl_motion_fn(0, 8);
762 763
 hl_motion_fn(1, 16);
... ...
@@ -796,16 +840,19 @@ static void free_tables(H264Context *h, int free_rbsp){
796 796
 }
797 797
 
798 798
 static void init_dequant8_coeff_table(H264Context *h){
799
-    int i,q,x;
799
+    int i,j,q,x;
800 800
     const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
801
-    h->dequant8_coeff[0] = h->dequant8_buffer[0];
802
-    h->dequant8_coeff[1] = h->dequant8_buffer[1];
803 801
 
804
-    for(i=0; i<2; i++ ){
805
-        if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
806
-            h->dequant8_coeff[1] = h->dequant8_buffer[0];
807
-            break;
802
+    for(i=0; i<6; i++ ){
803
+        h->dequant8_coeff[i] = h->dequant8_buffer[i];
804
+        for(j=0; j<i; j++){
805
+            if(!memcmp(h->pps.scaling_matrix8[j], h->pps.scaling_matrix8[i], 64*sizeof(uint8_t))){
806
+                h->dequant8_coeff[i] = h->dequant8_buffer[j];
807
+                break;
808
+            }
808 809
         }
810
+        if(j<i)
811
+            continue;
809 812
 
810 813
         for(q=0; q<max_qp+1; q++){
811 814
             int shift = div6[q];
... ...
@@ -853,7 +900,7 @@ static void init_dequant_tables(H264Context *h){
853 853
             for(x=0; x<16; x++)
854 854
                 h->dequant4_coeff[i][0][x] = 1<<6;
855 855
         if(h->pps.transform_8x8_mode)
856
-            for(i=0; i<2; i++)
856
+            for(i=0; i<6; i++)
857 857
                 for(x=0; x<64; x++)
858 858
                     h->dequant8_coeff[i][0][x] = 1<<6;
859 859
     }
... ...
@@ -868,7 +915,7 @@ int ff_h264_alloc_tables(H264Context *h){
868 868
 
869 869
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, row_mb_num * 8  * sizeof(uint8_t), fail)
870 870
 
871
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 32 * sizeof(uint8_t), fail)
871
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 48 * sizeof(uint8_t), fail)
872 872
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
873 873
     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
874 874
 
... ...
@@ -930,8 +977,8 @@ static void clone_tables(H264Context *dst, H264Context *src, int i){
930 930
  * Allocate buffers which are not shared amongst multiple threads.
931 931
  */
932 932
 static int context_init(H264Context *h){
933
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
934
-    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t)*2, fail)
933
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
934
+    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * 16*3 * sizeof(uint8_t)*2, fail)
935 935
 
936 936
     h->ref_cache[0][scan8[5 ]+1] = h->ref_cache[0][scan8[7 ]+1] = h->ref_cache[0][scan8[13]+1] =
937 937
     h->ref_cache[1][scan8[5 ]+1] = h->ref_cache[1][scan8[7 ]+1] = h->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
... ...
@@ -1130,9 +1177,10 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
1130 1130
 
1131 1131
         // frame_start may not be called for the next thread (if it's decoding a bottom field)
1132 1132
         // so this has to be allocated here
1133
-        h->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
1133
+        h->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
1134 1134
 
1135 1135
         s->dsp.clear_blocks(h->mb);
1136
+        s->dsp.clear_blocks(h->mb+(24*16<<h->pixel_shift));
1136 1137
     }
1137 1138
 
1138 1139
     //extradata/NAL handling
... ...
@@ -1151,7 +1199,7 @@ static int decode_update_thread_context(AVCodecContext *dst, const AVCodecContex
1151 1151
     for(i=0; i<6; i++)
1152 1152
         h->dequant4_coeff[i] = h->dequant4_buffer[0] + (h1->dequant4_coeff[i] - h1->dequant4_buffer[0]);
1153 1153
 
1154
-    for(i=0; i<2; i++)
1154
+    for(i=0; i<6; i++)
1155 1155
         h->dequant8_coeff[i] = h->dequant8_buffer[0] + (h1->dequant8_coeff[i] - h1->dequant8_buffer[0]);
1156 1156
 
1157 1157
     h->dequant_coeff_pps = h1->dequant_coeff_pps;
... ...
@@ -1206,20 +1254,20 @@ int ff_h264_frame_start(H264Context *h){
1206 1206
 
1207 1207
     for(i=0; i<16; i++){
1208 1208
         h->block_offset[i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
1209
-        h->block_offset[24+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
1209
+        h->block_offset[48+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
1210 1210
     }
1211
-    for(i=0; i<4; i++){
1211
+    for(i=0; i<16; i++){
1212 1212
         h->block_offset[16+i]=
1213
-        h->block_offset[20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
1214
-        h->block_offset[24+16+i]=
1215
-        h->block_offset[24+20+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
1213
+        h->block_offset[32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
1214
+        h->block_offset[48+16+i]=
1215
+        h->block_offset[48+32+i]= (4*((scan8[i] - scan8[0])&7) << pixel_shift) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
1216 1216
     }
1217 1217
 
1218 1218
     /* can't be in alloc_tables because linesize isn't known there.
1219 1219
      * FIXME: redo bipred weight to not require extra buffer? */
1220 1220
     for(i = 0; i < thread_count; i++)
1221 1221
         if(h->thread_context[i] && !h->thread_context[i]->s.obmc_scratchpad)
1222
-            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
1222
+            h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*6*s->linesize);
1223 1223
 
1224 1224
     /* some macroblocks can be accessed before they're available in case of lost slices, mbaff or threading*/
1225 1225
     memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
... ...
@@ -1404,7 +1452,7 @@ static void decode_postinit(H264Context *h, int setup_finished){
1404 1404
         ff_thread_finish_setup(s->avctx);
1405 1405
 }
1406 1406
 
1407
-static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
1407
+static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int chroma444, int simple){
1408 1408
     MpegEncContext * const s = &h->s;
1409 1409
     uint8_t *top_border;
1410 1410
     int top_idx = 1;
... ...
@@ -1422,12 +1470,24 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
1422 1422
                 if (pixel_shift)
1423 1423
                     AV_COPY128(top_border+16, src_y+15*linesize+16);
1424 1424
                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1425
-                    if (pixel_shift) {
1426
-                        AV_COPY128(top_border+32, src_cb+7*uvlinesize);
1427
-                        AV_COPY128(top_border+48, src_cr+7*uvlinesize);
1425
+                    if(chroma444){
1426
+                        if (pixel_shift){
1427
+                            AV_COPY128(top_border+32, src_cb + 15*uvlinesize);
1428
+                            AV_COPY128(top_border+48, src_cb + 15*uvlinesize+16);
1429
+                            AV_COPY128(top_border+64, src_cr + 15*uvlinesize);
1430
+                            AV_COPY128(top_border+80, src_cr + 15*uvlinesize+16);
1431
+                        } else {
1432
+                            AV_COPY128(top_border+16, src_cb + 15*uvlinesize);
1433
+                            AV_COPY128(top_border+32, src_cr + 15*uvlinesize);
1434
+                        }
1428 1435
                     } else {
1429
-                    AV_COPY64(top_border+16, src_cb+7*uvlinesize);
1430
-                    AV_COPY64(top_border+24, src_cr+7*uvlinesize);
1436
+                        if (pixel_shift) {
1437
+                            AV_COPY128(top_border+32, src_cb+7*uvlinesize);
1438
+                            AV_COPY128(top_border+48, src_cr+7*uvlinesize);
1439
+                        } else {
1440
+                            AV_COPY64(top_border+16, src_cb+7*uvlinesize);
1441
+                            AV_COPY64(top_border+24, src_cr+7*uvlinesize);
1442
+                        }
1431 1443
                     }
1432 1444
                 }
1433 1445
             }
... ...
@@ -1445,12 +1505,24 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
1445 1445
         AV_COPY128(top_border+16, src_y+16*linesize+16);
1446 1446
 
1447 1447
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1448
-        if (pixel_shift) {
1449
-            AV_COPY128(top_border+32, src_cb+8*uvlinesize);
1450
-            AV_COPY128(top_border+48, src_cr+8*uvlinesize);
1448
+        if(chroma444){
1449
+            if (pixel_shift){
1450
+                AV_COPY128(top_border+32, src_cb + 16*linesize);
1451
+                AV_COPY128(top_border+48, src_cb + 16*linesize+16);
1452
+                AV_COPY128(top_border+64, src_cr + 16*linesize);
1453
+                AV_COPY128(top_border+80, src_cr + 16*linesize+16);
1454
+            } else {
1455
+                AV_COPY128(top_border+16, src_cb + 16*linesize);
1456
+                AV_COPY128(top_border+32, src_cr + 16*linesize);
1457
+            }
1451 1458
         } else {
1452
-        AV_COPY64(top_border+16, src_cb+8*uvlinesize);
1453
-        AV_COPY64(top_border+24, src_cr+8*uvlinesize);
1459
+            if (pixel_shift) {
1460
+                AV_COPY128(top_border+32, src_cb+8*uvlinesize);
1461
+                AV_COPY128(top_border+48, src_cr+8*uvlinesize);
1462
+            } else {
1463
+                AV_COPY64(top_border+16, src_cb+8*uvlinesize);
1464
+                AV_COPY64(top_border+24, src_cr+8*uvlinesize);
1465
+            }
1454 1466
         }
1455 1467
     }
1456 1468
 }
... ...
@@ -1458,7 +1530,8 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
1458 1458
 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y,
1459 1459
                                   uint8_t *src_cb, uint8_t *src_cr,
1460 1460
                                   int linesize, int uvlinesize,
1461
-                                  int xchg, int simple, int pixel_shift){
1461
+                                  int xchg, int chroma444,
1462
+                                  int simple, int pixel_shift){
1462 1463
     MpegEncContext * const s = &h->s;
1463 1464
     int deblock_topleft;
1464 1465
     int deblock_top;
... ...
@@ -1513,13 +1586,28 @@ else      AV_COPY64(b,a);
1513 1513
         }
1514 1514
     }
1515 1515
     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1516
-        if(deblock_top){
1516
+        if(chroma444){
1517 1517
             if(deblock_topleft){
1518
-                XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
1519
-                XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
1518
+                XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
1519
+                XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
1520
+            }
1521
+            XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
1522
+            XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
1523
+            XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
1524
+            XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
1525
+            if(s->mb_x+1 < s->mb_width){
1526
+                XCHG(h->top_borders[top_idx][s->mb_x+1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
1527
+                XCHG(h->top_borders[top_idx][s->mb_x+1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
1528
+            }
1529
+        } else {
1530
+            if(deblock_top){
1531
+                if(deblock_topleft){
1532
+                    XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
1533
+                    XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
1534
+                }
1535
+                XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
1536
+                XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
1520 1537
             }
1521
-            XCHG(top_border + (16 << pixel_shift), src_cb+1+pixel_shift, 1);
1522
-            XCHG(top_border + (24 << pixel_shift), src_cr+1+pixel_shift, 1);
1523 1538
         }
1524 1539
     }
1525 1540
 }
... ...
@@ -1538,6 +1626,159 @@ static av_always_inline void dctcoef_set(DCTELEM *mb, int high_bit_depth, int in
1538 1538
         AV_WN16A(mb + index, value);
1539 1539
 }
1540 1540
 
1541
+static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
1542
+                                                       int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
1543
+{
1544
+    MpegEncContext * const s = &h->s;
1545
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
1546
+    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
1547
+    int i;
1548
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
1549
+    block_offset += 16*p;
1550
+    if(IS_INTRA4x4(mb_type)){
1551
+        if(simple || !s->encoding){
1552
+            if(IS_8x8DCT(mb_type)){
1553
+                if(transform_bypass){
1554
+                    idct_dc_add =
1555
+                    idct_add    = s->dsp.add_pixels8;
1556
+                }else{
1557
+                    idct_dc_add = h->h264dsp.h264_idct8_dc_add;
1558
+                    idct_add    = h->h264dsp.h264_idct8_add;
1559
+                }
1560
+                for(i=0; i<16; i+=4){
1561
+                    uint8_t * const ptr= dest_y + block_offset[i];
1562
+                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1563
+                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1564
+                        h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1565
+                    }else{
1566
+                        const int nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
1567
+                        h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
1568
+                                                    (h->topright_samples_available<<i)&0x4000, linesize);
1569
+                        if(nnz){
1570
+                            if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
1571
+                                idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1572
+                            else
1573
+                                idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1574
+                        }
1575
+                    }
1576
+                }
1577
+            }else{
1578
+                if(transform_bypass){
1579
+                    idct_dc_add =
1580
+                    idct_add    = s->dsp.add_pixels4;
1581
+                }else{
1582
+                    idct_dc_add = h->h264dsp.h264_idct_dc_add;
1583
+                    idct_add    = h->h264dsp.h264_idct_add;
1584
+                }
1585
+                for(i=0; i<16; i++){
1586
+                    uint8_t * const ptr= dest_y + block_offset[i];
1587
+                    const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1588
+
1589
+                    if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1590
+                        h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1591
+                    }else{
1592
+                        uint8_t *topright;
1593
+                        int nnz, tr;
1594
+                        uint64_t tr_high;
1595
+                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
1596
+                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
1597
+                            assert(mb_y || linesize <= block_offset[i]);
1598
+                            if(!topright_avail){
1599
+                                if (pixel_shift) {
1600
+                                    tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
1601
+                                    topright= (uint8_t*) &tr_high;
1602
+                                } else {
1603
+                                    tr= ptr[3 - linesize]*0x01010101;
1604
+                                    topright= (uint8_t*) &tr;
1605
+                                }
1606
+                            }else
1607
+                                topright= ptr + (4 << pixel_shift) - linesize;
1608
+                        }else
1609
+                            topright= NULL;
1610
+
1611
+                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
1612
+                        nnz = h->non_zero_count_cache[ scan8[i+p*16] ];
1613
+                        if(nnz){
1614
+                            if(is_h264){
1615
+                                if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16+p*256))
1616
+                                    idct_dc_add(ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1617
+                                else
1618
+                                    idct_add   (ptr, h->mb + (i*16+p*256 << pixel_shift), linesize);
1619
+                            }else
1620
+                                ff_svq3_add_idct_c(ptr, h->mb + i*16+p*256, linesize, qscale, 0);
1621
+                        }
1622
+                    }
1623
+                }
1624
+            }
1625
+        }
1626
+    }else{
1627
+        h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
1628
+        if(is_h264){
1629
+            if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX+p] ]){
1630
+                if(!transform_bypass)
1631
+                    h->h264dsp.h264_luma_dc_dequant_idct(h->mb+(p*256 << pixel_shift), h->mb_luma_dc[p], h->dequant4_coeff[p][qscale][0]);
1632
+                else{
1633
+                    static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
1634
+                                                            8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
1635
+                    for(i = 0; i < 16; i++)
1636
+                        dctcoef_set(h->mb+p*256, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc[p], pixel_shift, i));
1637
+                }
1638
+            }
1639
+        }else
1640
+            ff_svq3_luma_dc_dequant_idct_c(h->mb+p*256, h->mb_luma_dc[p], qscale);
1641
+    }
1642
+}
1643
+
1644
+static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, int is_h264, int simple, int transform_bypass,
1645
+                                                    int pixel_shift, int *block_offset, int linesize, uint8_t *dest_y, int p)
1646
+{
1647
+    MpegEncContext * const s = &h->s;
1648
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
1649
+    int i;
1650
+    block_offset += 16*p;
1651
+    if(!IS_INTRA4x4(mb_type)){
1652
+        if(is_h264){
1653
+            if(IS_INTRA16x16(mb_type)){
1654
+                if(transform_bypass){
1655
+                    if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
1656
+                        h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize);
1657
+                    }else{
1658
+                        for(i=0; i<16; i++){
1659
+                            if(h->non_zero_count_cache[ scan8[i+p*16] ] || dctcoef_get(h->mb, pixel_shift, i*16))
1660
+                                s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
1661
+                        }
1662
+                    }
1663
+                }else{
1664
+                    h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
1665
+                }
1666
+            }else if(h->cbp&15){
1667
+                if(transform_bypass){
1668
+                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
1669
+                    idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
1670
+                    for(i=0; i<16; i+=di){
1671
+                        if(h->non_zero_count_cache[ scan8[i+p*16] ]){
1672
+                            idct_add(dest_y + block_offset[i], h->mb + (i*16+p*256 << pixel_shift), linesize);
1673
+                        }
1674
+                    }
1675
+                }else{
1676
+                    if(IS_8x8DCT(mb_type)){
1677
+                        h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
1678
+                    }else{
1679
+                        h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb + (p*256 << pixel_shift), linesize, h->non_zero_count_cache+p*5*8);
1680
+                    }
1681
+                }
1682
+            }
1683
+        }else{
1684
+            for(i=0; i<16; i++){
1685
+                if(h->non_zero_count_cache[ scan8[i+p*16] ] || h->mb[i*16+p*256]){ //FIXME benchmark weird rule, & below
1686
+                    uint8_t * const ptr= dest_y + block_offset[i];
1687
+                    ff_svq3_add_idct_c(ptr, h->mb + i*16 + p*256, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
1688
+                }
1689
+            }
1690
+        }
1691
+    }
1692
+}
1693
+
1541 1694
 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
1542 1695
     MpegEncContext * const s = &h->s;
1543 1696
     const int mb_x= s->mb_x;
... ...
@@ -1546,13 +1787,12 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
1546 1546
     const int mb_type= s->current_picture.mb_type[mb_xy];
1547 1547
     uint8_t  *dest_y, *dest_cb, *dest_cr;
1548 1548
     int linesize, uvlinesize /*dct_offset*/;
1549
-    int i;
1549
+    int i, j;
1550 1550
     int *block_offset = &h->block_offset[0];
1551 1551
     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
1552 1552
     /* is_h264 should always be true if SVQ3 is disabled. */
1553 1553
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
1554 1554
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
1555
-    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
1556 1555
 
1557 1556
     dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
1558 1557
     dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
... ...
@@ -1566,7 +1806,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
1566 1566
     if (!simple && MB_FIELD) {
1567 1567
         linesize   = h->mb_linesize   = s->linesize * 2;
1568 1568
         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
1569
-        block_offset = &h->block_offset[24];
1569
+        block_offset = &h->block_offset[48];
1570 1570
         if(mb_y&1){ //FIXME move out of this function?
1571 1571
             dest_y -= s->linesize*15;
1572 1572
             dest_cb-= s->uvlinesize*7;
... ...
@@ -1607,227 +1847,95 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
1607 1607
                 for (j = 0; j < 16; j++)
1608 1608
                     tmp_y[j] = get_bits(&gb, bit_depth);
1609 1609
             }
1610
-            for (i = 0; i < 8; i++) {
1611
-                uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
1612
-                for (j = 0; j < 8; j++)
1613
-                    tmp_cb[j] = get_bits(&gb, bit_depth);
1614
-            }
1615
-            for (i = 0; i < 8; i++) {
1616
-                uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
1617
-                for (j = 0; j < 8; j++)
1618
-                    tmp_cr[j] = get_bits(&gb, bit_depth);
1610
+            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1611
+                for (i = 0; i < 8; i++) {
1612
+                    uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
1613
+                    for (j = 0; j < 8; j++)
1614
+                        tmp_cb[j] = get_bits(&gb, bit_depth);
1615
+                }
1616
+                for (i = 0; i < 8; i++) {
1617
+                    uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
1618
+                    for (j = 0; j < 8; j++)
1619
+                        tmp_cr[j] = get_bits(&gb, bit_depth);
1620
+                }
1619 1621
             }
1620 1622
         } else {
1621
-        for (i=0; i<16; i++) {
1622
-            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
1623
-        }
1624
-        for (i=0; i<8; i++) {
1625
-            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
1626
-            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
1627
-        }
1623
+            for (i=0; i<16; i++) {
1624
+                memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
1625
+            }
1626
+            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1627
+                for (i=0; i<8; i++) {
1628
+                    memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
1629
+                    memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
1630
+                }
1631
+            }
1628 1632
         }
1629 1633
     } else {
1630 1634
         if(IS_INTRA(mb_type)){
1631 1635
             if(h->deblocking_filter)
1632
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple, pixel_shift);
1636
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, 0, simple, pixel_shift);
1633 1637
 
1634 1638
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
1635 1639
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
1636 1640
                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
1637 1641
             }
1638 1642
 
1639
-            if(IS_INTRA4x4(mb_type)){
1640
-                if(simple || !s->encoding){
1641
-                    if(IS_8x8DCT(mb_type)){
1642
-                        if(transform_bypass){
1643
-                            idct_dc_add =
1644
-                            idct_add    = s->dsp.add_pixels8;
1645
-                        }else{
1646
-                            idct_dc_add = h->h264dsp.h264_idct8_dc_add;
1647
-                            idct_add    = h->h264dsp.h264_idct8_add;
1648
-                        }
1649
-                        for(i=0; i<16; i+=4){
1650
-                            uint8_t * const ptr= dest_y + block_offset[i];
1651
-                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1652
-                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1653
-                                h->hpc.pred8x8l_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
1654
-                            }else{
1655
-                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
1656
-                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
1657
-                                                            (h->topright_samples_available<<i)&0x4000, linesize);
1658
-                                if(nnz){
1659
-                                    if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
1660
-                                        idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
1661
-                                    else
1662
-                                        idct_add   (ptr, h->mb + (i*16 << pixel_shift), linesize);
1663
-                                }
1664
-                            }
1665
-                        }
1666
-                    }else{
1667
-                        if(transform_bypass){
1668
-                            idct_dc_add =
1669
-                            idct_add    = s->dsp.add_pixels4;
1670
-                        }else{
1671
-                            idct_dc_add = h->h264dsp.h264_idct_dc_add;
1672
-                            idct_add    = h->h264dsp.h264_idct_add;
1673
-                        }
1674
-                        for(i=0; i<16; i++){
1675
-                            uint8_t * const ptr= dest_y + block_offset[i];
1676
-                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
1677
-
1678
-                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
1679
-                                h->hpc.pred4x4_add[dir](ptr, h->mb + (i*16 << pixel_shift), linesize);
1680
-                            }else{
1681
-                                uint8_t *topright;
1682
-                                int nnz, tr;
1683
-                                uint64_t tr_high;
1684
-                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
1685
-                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
1686
-                                    assert(mb_y || linesize <= block_offset[i]);
1687
-                                    if(!topright_avail){
1688
-                                        if (pixel_shift) {
1689
-                                            tr_high= ((uint16_t*)ptr)[3 - linesize/2]*0x0001000100010001ULL;
1690
-                                            topright= (uint8_t*) &tr_high;
1691
-                                        } else {
1692
-                                        tr= ptr[3 - linesize]*0x01010101;
1693
-                                        topright= (uint8_t*) &tr;
1694
-                                        }
1695
-                                    }else
1696
-                                        topright= ptr + (4 << pixel_shift) - linesize;
1697
-                                }else
1698
-                                    topright= NULL;
1699
-
1700
-                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
1701
-                                nnz = h->non_zero_count_cache[ scan8[i] ];
1702
-                                if(nnz){
1703
-                                    if(is_h264){
1704
-                                        if(nnz == 1 && dctcoef_get(h->mb, pixel_shift, i*16))
1705
-                                            idct_dc_add(ptr, h->mb + (i*16 << pixel_shift), linesize);
1706
-                                        else
1707
-                                            idct_add   (ptr, h->mb + (i*16<<pixel_shift), linesize);
1708
-                                    }
1709
-#if CONFIG_SVQ3_DECODER
1710
-                                    else
1711
-                                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
1712
-#endif
1713
-                                }
1714
-                            }
1715
-                        }
1716
-                    }
1717
-                }
1718
-            }else{
1719
-                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
1720
-                if(is_h264){
1721
-                    if(h->non_zero_count_cache[ scan8[LUMA_DC_BLOCK_INDEX] ]){
1722
-                        if(!transform_bypass)
1723
-                            h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
1724
-                        else{
1725
-                            static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
1726
-                                                                    8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
1727
-                            for(i = 0; i < 16; i++)
1728
-                                dctcoef_set(h->mb, pixel_shift, dc_mapping[i], dctcoef_get(h->mb_luma_dc, pixel_shift, i));
1729
-                        }
1730
-                    }
1731
-                }
1732
-#if CONFIG_SVQ3_DECODER
1733
-                else
1734
-                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
1735
-#endif
1736
-            }
1643
+            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
1644
+
1737 1645
             if(h->deblocking_filter)
1738
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple, pixel_shift);
1646
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
1739 1647
         }else if(is_h264){
1740 1648
             if (pixel_shift) {
1741 1649
                 hl_motion_16(h, dest_y, dest_cb, dest_cr,
1742 1650
                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
1743 1651
                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
1744 1652
                              h->h264dsp.weight_h264_pixels_tab,
1745
-                             h->h264dsp.biweight_h264_pixels_tab);
1653
+                             h->h264dsp.biweight_h264_pixels_tab, 0);
1746 1654
             } else
1747 1655
                 hl_motion_8(h, dest_y, dest_cb, dest_cr,
1748 1656
                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
1749 1657
                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
1750 1658
                             h->h264dsp.weight_h264_pixels_tab,
1751
-                            h->h264dsp.biweight_h264_pixels_tab);
1659
+                            h->h264dsp.biweight_h264_pixels_tab, 0);
1752 1660
         }
1753 1661
 
1754
-
1755
-        if(!IS_INTRA4x4(mb_type)){
1756
-            if(is_h264){
1757
-                if(IS_INTRA16x16(mb_type)){
1758
-                    if(transform_bypass){
1759
-                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
1760
-                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
1761
-                        }else{
1762
-                            for(i=0; i<16; i++){
1763
-                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
1764
-                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
1765
-                            }
1766
-                        }
1767
-                    }else{
1768
-                         h->h264dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1769
-                    }
1770
-                }else if(h->cbp&15){
1771
-                    if(transform_bypass){
1772
-                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
1773
-                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
1774
-                        for(i=0; i<16; i+=di){
1775
-                            if(h->non_zero_count_cache[ scan8[i] ]){
1776
-                                idct_add(dest_y + block_offset[i], h->mb + (i*16 << pixel_shift), linesize);
1777
-                            }
1778
-                        }
1779
-                    }else{
1780
-                        if(IS_8x8DCT(mb_type)){
1781
-                            h->h264dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1782
-                        }else{
1783
-                            h->h264dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
1784
-                        }
1785
-                    }
1786
-                }
1787
-            }
1788
-#if CONFIG_SVQ3_DECODER
1789
-            else{
1790
-                for(i=0; i<16; i++){
1791
-                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
1792
-                        uint8_t * const ptr= dest_y + block_offset[i];
1793
-                        ff_svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
1794
-                    }
1795
-                }
1796
-            }
1797
-#endif
1798
-        }
1662
+        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
1799 1663
 
1800 1664
         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
1801 1665
             uint8_t *dest[2] = {dest_cb, dest_cr};
1802 1666
             if(transform_bypass){
1803 1667
                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
1804
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16 << pixel_shift), uvlinesize);
1805
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + (20*16 << pixel_shift), uvlinesize);
1668
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + (16*16*1 << pixel_shift), uvlinesize);
1669
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 32, h->mb + (16*16*2 << pixel_shift), uvlinesize);
1806 1670
                 }else{
1807 1671
                     idct_add = s->dsp.add_pixels4;
1808
-                    for(i=16; i<16+8; i++){
1809
-                        if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
1810
-                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
1672
+                    for(j=1; j<3; j++){
1673
+                        for(i=j*16; i<j*16+4; i++){
1674
+                            if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
1675
+                                idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
1676
+                        }
1811 1677
                     }
1812 1678
                 }
1813 1679
             }else{
1814 1680
                 if(is_h264){
1815 1681
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ])
1816
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16 << pixel_shift)       , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1682
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*1 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1817 1683
                     if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ])
1818
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + ((16*16+4*16) << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1684
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16*16*2 << pixel_shift), h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1819 1685
                     h->h264dsp.h264_idct_add8(dest, block_offset,
1820 1686
                                               h->mb, uvlinesize,
1821 1687
                                               h->non_zero_count_cache);
1822 1688
                 }
1823 1689
 #if CONFIG_SVQ3_DECODER
1824 1690
                 else{
1825
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16     , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1826
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1827
-                    for(i=16; i<16+8; i++){
1828
-                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
1829
-                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
1830
-                            ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
1691
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*1, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
1692
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16*2, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
1693
+                    for(j=1; j<3; j++){
1694
+                        for(i=j*16; i<j*16+4; i++){
1695
+                            if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
1696
+                                uint8_t * const ptr= dest[j-1] + block_offset[i];
1697
+                                ff_svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
1698
+                            }
1831 1699
                         }
1832 1700
                     }
1833 1701
                 }
... ...
@@ -1836,7 +1944,113 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
1836 1836
         }
1837 1837
     }
1838 1838
     if(h->cbp || IS_INTRA(mb_type))
1839
+    {
1840
+        s->dsp.clear_blocks(h->mb);
1841
+        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
1842
+    }
1843
+}
1844
+
1845
+static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simple, int pixel_shift){
1846
+    MpegEncContext * const s = &h->s;
1847
+    const int mb_x= s->mb_x;
1848
+    const int mb_y= s->mb_y;
1849
+    const int mb_xy= h->mb_xy;
1850
+    const int mb_type= s->current_picture.mb_type[mb_xy];
1851
+    uint8_t  *dest[3];
1852
+    int linesize;
1853
+    int i, j, p;
1854
+    int *block_offset = &h->block_offset[0];
1855
+    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
1856
+    const int plane_count = (simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) ? 3 : 1;
1857
+
1858
+    for (p = 0; p < plane_count; p++)
1859
+    {
1860
+        dest[p] = s->current_picture.data[p] + ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
1861
+        s->dsp.prefetch(dest[p] + (s->mb_x&3)*4*s->linesize + (64 << pixel_shift), s->linesize, 4);
1862
+    }
1863
+
1864
+    h->list_counts[mb_xy]= h->list_count;
1865
+
1866
+    if (!simple && MB_FIELD) {
1867
+        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
1868
+        block_offset = &h->block_offset[48];
1869
+        if(mb_y&1) //FIXME move out of this function?
1870
+            for (p = 0; p < 3; p++)
1871
+                dest[p] -= s->linesize*15;
1872
+        if(FRAME_MBAFF) {
1873
+            int list;
1874
+            for(list=0; list<h->list_count; list++){
1875
+                if(!USES_LIST(mb_type, list))
1876
+                    continue;
1877
+                if(IS_16X16(mb_type)){
1878
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
1879
+                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
1880
+                }else{
1881
+                    for(i=0; i<16; i+=4){
1882
+                        int ref = h->ref_cache[list][scan8[i]];
1883
+                        if(ref >= 0)
1884
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
1885
+                    }
1886
+                }
1887
+            }
1888
+        }
1889
+    } else {
1890
+        linesize   = h->mb_linesize = h->mb_uvlinesize = s->linesize;
1891
+    }
1892
+
1893
+    if (!simple && IS_INTRA_PCM(mb_type)) {
1894
+        if (pixel_shift) {
1895
+            const int bit_depth = h->sps.bit_depth_luma;
1896
+            GetBitContext gb;
1897
+            init_get_bits(&gb, (uint8_t*)h->mb, 768*bit_depth);
1898
+
1899
+            for (p = 0; p < plane_count; p++) {
1900
+                for (i = 0; i < 16; i++) {
1901
+                    uint16_t *tmp = (uint16_t*)(dest[p] + i*linesize);
1902
+                    for (j = 0; j < 16; j++)
1903
+                        tmp[j] = get_bits(&gb, bit_depth);
1904
+                }
1905
+            }
1906
+        } else {
1907
+            for (p = 0; p < plane_count; p++) {
1908
+                for (i = 0; i < 16; i++) {
1909
+                    memcpy(dest[p] + i*linesize, h->mb + p*128 + i*8, 16);
1910
+                }
1911
+            }
1912
+        }
1913
+    } else {
1914
+        if(IS_INTRA(mb_type)){
1915
+            if(h->deblocking_filter)
1916
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 1, 1, simple, pixel_shift);
1917
+
1918
+            for (p = 0; p < plane_count; p++)
1919
+                hl_decode_mb_predict_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
1920
+
1921
+            if(h->deblocking_filter)
1922
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize, linesize, 0, 1, simple, pixel_shift);
1923
+        }else{
1924
+            if (pixel_shift) {
1925
+                hl_motion_16(h, dest[0], dest[1], dest[2],
1926
+                             s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
1927
+                             s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
1928
+                             h->h264dsp.weight_h264_pixels_tab,
1929
+                             h->h264dsp.biweight_h264_pixels_tab, 1);
1930
+            } else
1931
+                hl_motion_8(h, dest[0], dest[1], dest[2],
1932
+                            s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
1933
+                            s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
1934
+                            h->h264dsp.weight_h264_pixels_tab,
1935
+                            h->h264dsp.biweight_h264_pixels_tab, 1);
1936
+        }
1937
+
1938
+        for (p = 0; p < plane_count; p++)
1939
+            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass, pixel_shift, block_offset, linesize, dest[p], p);
1940
+    }
1941
+    if(h->cbp || IS_INTRA(mb_type))
1942
+    {
1839 1943
         s->dsp.clear_blocks(h->mb);
1944
+        s->dsp.clear_blocks(h->mb+(24*16<<pixel_shift));
1945
+    }
1840 1946
 }
1841 1947
 
1842 1948
 /**
... ...
@@ -1856,13 +2070,26 @@ static void av_noinline hl_decode_mb_complex(H264Context *h){
1856 1856
     hl_decode_mb_internal(h, 0, h->pixel_shift);
1857 1857
 }
1858 1858
 
1859
+static void av_noinline hl_decode_mb_444_complex(H264Context *h){
1860
+    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
1861
+}
1862
+
1863
+static void av_noinline hl_decode_mb_444_simple(H264Context *h){
1864
+    hl_decode_mb_444_internal(h, 1, 0);
1865
+}
1866
+
1859 1867
 void ff_h264_hl_decode_mb(H264Context *h){
1860 1868
     MpegEncContext * const s = &h->s;
1861 1869
     const int mb_xy= h->mb_xy;
1862 1870
     const int mb_type= s->current_picture.mb_type[mb_xy];
1863 1871
     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
1864 1872
 
1865
-    if (is_complex) {
1873
+    if (CHROMA444) {
1874
+        if(is_complex || h->pixel_shift)
1875
+            hl_decode_mb_444_complex(h);
1876
+        else
1877
+            hl_decode_mb_444_simple(h);
1878
+    } else if (is_complex) {
1866 1879
         hl_decode_mb_complex(h);
1867 1880
     } else if (h->pixel_shift) {
1868 1881
         hl_decode_mb_simple_16(h);
... ...
@@ -1878,7 +2105,7 @@ static int pred_weight_table(H264Context *h){
1878 1878
     h->use_weight= 0;
1879 1879
     h->use_weight_chroma= 0;
1880 1880
     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
1881
-    if(CHROMA)
1881
+    if(h->sps.chroma_format_idc)
1882 1882
         h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
1883 1883
     luma_def = 1<<h->luma_log2_weight_denom;
1884 1884
     chroma_def = 1<<h->chroma_log2_weight_denom;
... ...
@@ -1903,7 +2130,7 @@ static int pred_weight_table(H264Context *h){
1903 1903
                 h->luma_weight[i][list][1]= 0;
1904 1904
             }
1905 1905
 
1906
-            if(CHROMA){
1906
+            if(h->sps.chroma_format_idc){
1907 1907
                 chroma_weight_flag= get_bits1(&s->gb);
1908 1908
                 if(chroma_weight_flag){
1909 1909
                     int j;
... ...
@@ -2333,11 +2560,11 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
2333 2333
 
2334 2334
     h->b_stride=  s->mb_width*4;
2335 2335
 
2336
-    s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
2336
+    s->width = 16*s->mb_width - (2>>CHROMA444)*FFMIN(h->sps.crop_right, (8<<CHROMA444)-1);
2337 2337
     if(h->sps.frame_mbs_only_flag)
2338
-        s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
2338
+        s->height= 16*s->mb_height - (2>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
2339 2339
     else
2340
-        s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 7);
2340
+        s->height= 16*s->mb_height - (4>>CHROMA444)*FFMIN(h->sps.crop_bottom, (8<<CHROMA444)-1);
2341 2341
 
2342 2342
     if (s->context_initialized
2343 2343
         && (   s->width != s->avctx->width || s->height != s->avctx->height
... ...
@@ -2382,18 +2609,22 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
2382 2382
 
2383 2383
         switch (h->sps.bit_depth_luma) {
2384 2384
             case 9 :
2385
-                s->avctx->pix_fmt = PIX_FMT_YUV420P9;
2385
+                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P9 : PIX_FMT_YUV420P9;
2386 2386
                 break;
2387 2387
             case 10 :
2388
-                s->avctx->pix_fmt = PIX_FMT_YUV420P10;
2388
+                s->avctx->pix_fmt = CHROMA444 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV420P10;
2389 2389
                 break;
2390 2390
             default:
2391
-        s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
2392
-                                                 s->avctx->codec->pix_fmts ?
2393
-                                                 s->avctx->codec->pix_fmts :
2394
-                                                 s->avctx->color_range == AVCOL_RANGE_JPEG ?
2395
-                                                 hwaccel_pixfmt_list_h264_jpeg_420 :
2396
-                                                 ff_hwaccel_pixfmt_list_420);
2391
+                if (CHROMA444){
2392
+                    s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ444P : PIX_FMT_YUV444P;
2393
+                }else{
2394
+                    s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
2395
+                                                             s->avctx->codec->pix_fmts ?
2396
+                                                             s->avctx->codec->pix_fmts :
2397
+                                                             s->avctx->color_range == AVCOL_RANGE_JPEG ?
2398
+                                                             hwaccel_pixfmt_list_h264_jpeg_420 :
2399
+                                                             ff_hwaccel_pixfmt_list_420);
2400
+                }
2397 2401
         }
2398 2402
 
2399 2403
         s->avctx->hwaccel = ff_find_hwaccel(s->avctx->codec->id, s->avctx->pix_fmt);
... ...
@@ -2896,10 +3127,10 @@ static int fill_filter_caches(H264Context *h, int mb_type){
2896 2896
     if(IS_INTRA(mb_type))
2897 2897
         return 0;
2898 2898
 
2899
-    AV_COPY32(&h->non_zero_count_cache[4+8*1], &h->non_zero_count[mb_xy][ 4]);
2900
-    AV_COPY32(&h->non_zero_count_cache[4+8*2], &h->non_zero_count[mb_xy][12]);
2901
-    AV_COPY32(&h->non_zero_count_cache[4+8*3], &h->non_zero_count[mb_xy][20]);
2902
-    AV_COPY32(&h->non_zero_count_cache[4+8*4], &h->non_zero_count[mb_xy][28]);
2899
+    AV_COPY32(&h->non_zero_count_cache[4+8* 1], &h->non_zero_count[mb_xy][ 0]);
2900
+    AV_COPY32(&h->non_zero_count_cache[4+8* 2], &h->non_zero_count[mb_xy][ 4]);
2901
+    AV_COPY32(&h->non_zero_count_cache[4+8* 3], &h->non_zero_count[mb_xy][ 8]);
2902
+    AV_COPY32(&h->non_zero_count_cache[4+8* 4], &h->non_zero_count[mb_xy][12]);
2903 2903
 
2904 2904
     h->cbp= h->cbp_table[mb_xy];
2905 2905
 
... ...
@@ -2951,45 +3182,45 @@ static int fill_filter_caches(H264Context *h, int mb_type){
2951 2951
 */
2952 2952
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
2953 2953
     if(top_type){
2954
-        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
2954
+        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][3*4]);
2955 2955
     }
2956 2956
 
2957 2957
     if(left_type[0]){
2958
-        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][7+0*8];
2959
-        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][7+1*8];
2960
-        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][7+2*8];
2961
-        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][7+3*8];
2958
+        h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][3+0*4];
2959
+        h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][3+1*4];
2960
+        h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][3+2*4];
2961
+        h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][3+3*4];
2962 2962
     }
2963 2963
 
2964 2964
     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
2965 2965
     if(!CABAC && h->pps.transform_8x8_mode){
2966 2966
         if(IS_8x8DCT(top_type)){
2967 2967
             h->non_zero_count_cache[4+8*0]=
2968
-            h->non_zero_count_cache[5+8*0]= h->cbp_table[top_xy] & 4;
2968
+            h->non_zero_count_cache[5+8*0]= (h->cbp_table[top_xy] & 0x4000) >> 12;
2969 2969
             h->non_zero_count_cache[6+8*0]=
2970
-            h->non_zero_count_cache[7+8*0]= h->cbp_table[top_xy] & 8;
2970
+            h->non_zero_count_cache[7+8*0]= (h->cbp_table[top_xy] & 0x8000) >> 12;
2971 2971
         }
2972 2972
         if(IS_8x8DCT(left_type[0])){
2973 2973
             h->non_zero_count_cache[3+8*1]=
2974
-            h->non_zero_count_cache[3+8*2]= h->cbp_table[left_xy[0]]&2; //FIXME check MBAFF
2974
+            h->non_zero_count_cache[3+8*2]= (h->cbp_table[left_xy[0]]&0x2000) >> 12; //FIXME check MBAFF
2975 2975
         }
2976 2976
         if(IS_8x8DCT(left_type[1])){
2977 2977
             h->non_zero_count_cache[3+8*3]=
2978
-            h->non_zero_count_cache[3+8*4]= h->cbp_table[left_xy[1]]&8; //FIXME check MBAFF
2978
+            h->non_zero_count_cache[3+8*4]= (h->cbp_table[left_xy[1]]&0x8000) >> 12; //FIXME check MBAFF
2979 2979
         }
2980 2980
 
2981 2981
         if(IS_8x8DCT(mb_type)){
2982 2982
             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
2983
-            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
2983
+            h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= (h->cbp & 0x1000) >> 12;
2984 2984
 
2985 2985
             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
2986
-            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
2986
+            h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= (h->cbp & 0x2000) >> 12;
2987 2987
 
2988 2988
             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
2989
-            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
2989
+            h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= (h->cbp & 0x4000) >> 12;
2990 2990
 
2991 2991
             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
2992
-            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
2992
+            h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= (h->cbp & 0x8000) >> 12;
2993 2993
         }
2994 2994
     }
2995 2995
 
... ...
@@ -3063,8 +3294,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
3063 3063
                 s->mb_x= mb_x;
3064 3064
                 s->mb_y= mb_y;
3065 3065
                 dest_y  = s->current_picture.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
3066
-                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
3067
-                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * 8;
3066
+                dest_cb = s->current_picture.data[1] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
3067
+                dest_cr = s->current_picture.data[2] + ((mb_x << pixel_shift) + mb_y * s->uvlinesize) * (8 << CHROMA444);
3068 3068
                     //FIXME simplify above
3069 3069
 
3070 3070
                 if (MB_FIELD) {
... ...
@@ -3079,7 +3310,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
3079 3079
                     linesize   = h->mb_linesize   = s->linesize;
3080 3080
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3081 3081
                 }
3082
-                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3082
+                backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, CHROMA444, 0);
3083 3083
                 if(fill_filter_caches(h, mb_type))
3084 3084
                     continue;
3085 3085
                 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
... ...
@@ -39,9 +39,6 @@
39 39
 #define interlaced_dct interlaced_dct_is_a_bad_name
40 40
 #define mb_intra mb_intra_is_not_initialized_see_mb_type
41 41
 
42
-#define LUMA_DC_BLOCK_INDEX   24
43
-#define CHROMA_DC_BLOCK_INDEX 25
44
-
45 42
 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
46 43
 #define COEFF_TOKEN_VLC_BITS           8
47 44
 #define TOTAL_ZEROS_VLC_BITS           9
... ...
@@ -60,8 +57,6 @@
60 60
  * of progressive decoding by about 2%. */
61 61
 #define ALLOW_INTERLACE
62 62
 
63
-#define ALLOW_NOCHROMA
64
-
65 63
 #define FMO 0
66 64
 
67 65
 /**
... ...
@@ -85,16 +80,12 @@
85 85
 #endif
86 86
 #define FIELD_OR_MBAFF_PICTURE (FRAME_MBAFF || FIELD_PICTURE)
87 87
 
88
-#ifdef ALLOW_NOCHROMA
89
-#define CHROMA h->sps.chroma_format_idc
90
-#else
91
-#define CHROMA 1
92
-#endif
93
-
94 88
 #ifndef CABAC
95 89
 #define CABAC h->pps.cabac
96 90
 #endif
97 91
 
92
+#define CHROMA444 (h->sps.chroma_format_idc == 3)
93
+
98 94
 #define EXTENDED_SAR          255
99 95
 
100 96
 #define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit
... ...
@@ -198,7 +189,7 @@ typedef struct SPS{
198 198
     int num_reorder_frames;
199 199
     int scaling_matrix_present;
200 200
     uint8_t scaling_matrix4[6][16];
201
-    uint8_t scaling_matrix8[2][64];
201
+    uint8_t scaling_matrix8[6][64];
202 202
     int nal_hrd_parameters_present_flag;
203 203
     int vcl_hrd_parameters_present_flag;
204 204
     int pic_struct_present_flag;
... ...
@@ -233,7 +224,7 @@ typedef struct PPS{
233 233
     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
234 234
     int transform_8x8_mode;     ///< transform_8x8_mode_flag
235 235
     uint8_t scaling_matrix4[6][16];
236
-    uint8_t scaling_matrix8[2][64];
236
+    uint8_t scaling_matrix8[6][64];
237 237
     uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
238 238
     int chroma_qp_diff;
239 239
 }PPS;
... ...
@@ -298,21 +289,15 @@ typedef struct H264Context{
298 298
     unsigned int top_samples_available;
299 299
     unsigned int topright_samples_available;
300 300
     unsigned int left_samples_available;
301
-    uint8_t (*top_borders[2])[(16+2*8)*2];
301
+    uint8_t (*top_borders[2])[(16*3)*2];
302 302
 
303 303
     /**
304 304
      * non zero coeff count cache.
305 305
      * is 64 if not available.
306 306
      */
307
-    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
307
+    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[15*8];
308 308
 
309
-    /*
310
-    .UU.YYYY
311
-    .UU.YYYY
312
-    .vv.YYYY
313
-    .VV.YYYY
314
-    */
315
-    uint8_t (*non_zero_count)[32];
309
+    uint8_t (*non_zero_count)[48];
316 310
 
317 311
     /**
318 312
      * Motion vector cache.
... ...
@@ -336,7 +321,7 @@ typedef struct H264Context{
336 336
  * block_offset[ 0..47] for frame macroblocks
337 337
  * block_offset[48..95] for field macroblocks
338 338
      */
339
-    int block_offset[2*(16+8)];
339
+    int block_offset[2*(16*3)];
340 340
 
341 341
     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
342 342
     uint32_t *mb2br_xy;
... ...
@@ -356,9 +341,9 @@ typedef struct H264Context{
356 356
     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
357 357
 
358 358
     uint32_t dequant4_buffer[6][QP_MAX_NUM+1][16]; //FIXME should these be moved down?
359
-    uint32_t dequant8_buffer[2][QP_MAX_NUM+1][64];
359
+    uint32_t dequant8_buffer[6][QP_MAX_NUM+1][64];
360 360
     uint32_t (*dequant4_coeff[6])[16];
361
-    uint32_t (*dequant8_coeff[2])[64];
361
+    uint32_t (*dequant8_coeff[6])[64];
362 362
 
363 363
     int slice_num;
364 364
     uint16_t *slice_table;     ///< slice_table_base + 2*mb_stride + 1
... ...
@@ -408,15 +393,15 @@ typedef struct H264Context{
408 408
     GetBitContext *intra_gb_ptr;
409 409
     GetBitContext *inter_gb_ptr;
410 410
 
411
-    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24*2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
412
-    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16*2];
411
+    DECLARE_ALIGNED(16, DCTELEM, mb)[16*48*2]; ///< as a dct coefficient is int32_t in high depth, we need to reserve twice the space.
412
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[3][16*2];
413 413
     DCTELEM mb_padding[256*2];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
414 414
 
415 415
     /**
416 416
      * Cabac
417 417
      */
418 418
     CABACContext cabac;
419
-    uint8_t      cabac_state[460];
419
+    uint8_t      cabac_state[1024];
420 420
 
421 421
     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
422 422
     uint16_t     *cbp_table;
... ...
@@ -721,27 +706,43 @@ o-o o-o
721 721
 */
722 722
 
723 723
 /* Scan8 organization:
724
- *   0 1 2 3 4 5 6 7
725
- * 0   u u y y y y y
726
- * 1 u U U y Y Y Y Y
727
- * 2 u U U y Y Y Y Y
728
- * 3   v v y Y Y Y Y
729
- * 4 v V V y Y Y Y Y
730
- * 5 v V V   DYDUDV
724
+ *    0 1 2 3 4 5 6 7
725
+ * 0  DY    y y y y y
726
+ * 1        y Y Y Y Y
727
+ * 2        y Y Y Y Y
728
+ * 3        y Y Y Y Y
729
+ * 4        y Y Y Y Y
730
+ * 5  DU    u u u u u
731
+ * 6        u U U U U
732
+ * 7        u U U U U
733
+ * 8        u U U U U
734
+ * 9        u U U U U
735
+ * 10 DV    v v v v v
736
+ * 11       v V V V V
737
+ * 12       v V V V V
738
+ * 13       v V V V V
739
+ * 14       v V V V V
731 740
  * DY/DU/DV are for luma/chroma DC.
732 741
  */
733 742
 
743
+#define LUMA_DC_BLOCK_INDEX   48
744
+#define CHROMA_DC_BLOCK_INDEX 49
745
+
734 746
 //This table must be here because scan8[constant] must be known at compiletime
735
-static const uint8_t scan8[16 + 2*4 + 3]={
736
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
737
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
738
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
739
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
740
- 1+1*8, 2+1*8,
741
- 1+2*8, 2+2*8,
742
- 1+4*8, 2+4*8,
743
- 1+5*8, 2+5*8,
744
- 4+5*8, 5+5*8, 6+5*8
747
+static const uint8_t scan8[16*3 + 3]={
748
+ 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
749
+ 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
750
+ 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
751
+ 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
752
+ 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
753
+ 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
754
+ 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
755
+ 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
756
+ 4+11*8, 5+11*8, 4+12*8, 5+12*8,
757
+ 6+11*8, 7+11*8, 6+12*8, 7+12*8,
758
+ 4+13*8, 5+13*8, 4+14*8, 5+14*8,
759
+ 6+13*8, 7+13*8, 6+14*8, 7+14*8,
760
+ 0+ 0*8, 0+ 5*8, 0+10*8
745 761
 };
746 762
 
747 763
 static av_always_inline uint32_t pack16to32(int a, int b){
... ...
@@ -773,11 +774,11 @@ static void fill_decode_neighbors(H264Context *h, int mb_type){
773 773
     MpegEncContext * const s = &h->s;
774 774
     const int mb_xy= h->mb_xy;
775 775
     int topleft_xy, top_xy, topright_xy, left_xy[2];
776
-    static const uint8_t left_block_options[4][16]={
777
-        {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
778
-        {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
779
-        {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
780
-        {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
776
+    static const uint8_t left_block_options[4][32]={
777
+        {0,1,2,3,7,10,8,11,3+0*4, 3+1*4, 3+2*4, 3+3*4, 1+4*4, 1+8*4, 1+5*4, 1+9*4},
778
+        {2,2,3,3,8,11,8,11,3+2*4, 3+2*4, 3+3*4, 3+3*4, 1+5*4, 1+9*4, 1+5*4, 1+9*4},
779
+        {0,0,1,1,7,10,7,10,3+0*4, 3+0*4, 3+1*4, 3+1*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4},
780
+        {0,2,0,2,7,10,7,10,3+0*4, 3+2*4, 3+0*4, 3+2*4, 1+4*4, 1+8*4, 1+4*4, 1+8*4}
781 781
     };
782 782
 
783 783
     h->topleft_partition= -1;
... ...
@@ -947,32 +948,41 @@ static void fill_decode_caches(H264Context *h, int mb_type){
947 947
 */
948 948
 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
949 949
     if(top_type){
950
-        AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]);
951
-            h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8];
952
-            h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8];
953
-
954
-            h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8];
955
-            h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8];
956
-    }else {
957
-            h->non_zero_count_cache[1+8*0]=
958
-            h->non_zero_count_cache[2+8*0]=
959
-
960
-            h->non_zero_count_cache[1+8*3]=
961
-            h->non_zero_count_cache[2+8*3]=
962
-            AV_WN32A(&h->non_zero_count_cache[4+8*0], CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040);
950
+        AV_COPY32(&h->non_zero_count_cache[4+8* 0], &h->non_zero_count[top_xy][4*3]);
951
+        if(CHROMA444){
952
+            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 7]);
953
+            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4*11]);
954
+        }else{
955
+            AV_COPY32(&h->non_zero_count_cache[4+8* 5], &h->non_zero_count[top_xy][4* 5]);
956
+            AV_COPY32(&h->non_zero_count_cache[4+8*10], &h->non_zero_count[top_xy][4* 9]);
957
+        }
958
+    }else{
959
+        uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
960
+        AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
961
+        AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
962
+        AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
963 963
     }
964 964
 
965 965
     for (i=0; i<2; i++) {
966 966
         if(left_type[i]){
967
-            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
968
-            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
969
-                h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
970
-                h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
967
+            h->non_zero_count_cache[3+8* 1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]];
968
+            h->non_zero_count_cache[3+8* 2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]];
969
+            if(CHROMA444){
970
+                h->non_zero_count_cache[3+8* 6 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+4*4];
971
+                h->non_zero_count_cache[3+8* 7 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+4*4];
972
+                h->non_zero_count_cache[3+8*11 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]+8*4];
973
+                h->non_zero_count_cache[3+8*12 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]+8*4];
974
+            }else{
975
+                h->non_zero_count_cache[3+8* 6 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]];
976
+                h->non_zero_count_cache[3+8*11 +   8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]];
977
+            }
971 978
         }else{
972
-                h->non_zero_count_cache[3+8*1 + 2*8*i]=
973
-                h->non_zero_count_cache[3+8*2 + 2*8*i]=
974
-                h->non_zero_count_cache[0+8*1 +   8*i]=
975
-                h->non_zero_count_cache[0+8*4 +   8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
979
+            h->non_zero_count_cache[3+8* 1 + 2*8*i]=
980
+            h->non_zero_count_cache[3+8* 2 + 2*8*i]=
981
+            h->non_zero_count_cache[3+8* 6 + 2*8*i]=
982
+            h->non_zero_count_cache[3+8* 7 + 2*8*i]=
983
+            h->non_zero_count_cache[3+8*11 + 2*8*i]=
984
+            h->non_zero_count_cache[3+8*12 + 2*8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64;
976 985
         }
977 986
     }
978 987
 
... ...
@@ -981,15 +991,15 @@ static void fill_decode_caches(H264Context *h, int mb_type){
981 981
         if(top_type) {
982 982
             h->top_cbp = h->cbp_table[top_xy];
983 983
         } else {
984
-            h->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
984
+            h->top_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
985 985
         }
986 986
         // left_cbp
987 987
         if (left_type[0]) {
988
-            h->left_cbp = (h->cbp_table[left_xy[0]] & 0x1f0)
988
+            h->left_cbp =   (h->cbp_table[left_xy[0]] & 0x7F0)
989 989
                         |  ((h->cbp_table[left_xy[0]]>>(left_block[0]&(~1)))&2)
990 990
                         | (((h->cbp_table[left_xy[1]]>>(left_block[2]&(~1)))&2) << 2);
991 991
         } else {
992
-            h->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
992
+            h->left_cbp = IS_INTRA(mb_type) ? 0x7CF : 0x00F;
993 993
         }
994 994
     }
995 995
     }
... ...
@@ -1190,11 +1200,21 @@ static inline int pred_intra_mode(H264Context *h, int n){
1190 1190
 static inline void write_back_non_zero_count(H264Context *h){
1191 1191
     const int mb_xy= h->mb_xy;
1192 1192
 
1193
-    AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]);
1194
-    AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]);
1195
-    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[0+8*5]);
1196
-    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8*3]);
1197
-    AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]);
1193
+    AV_COPY32(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[4+8* 1]);
1194
+    AV_COPY32(&h->non_zero_count[mb_xy][ 4], &h->non_zero_count_cache[4+8* 2]);
1195
+    AV_COPY32(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[4+8* 3]);
1196
+    AV_COPY32(&h->non_zero_count[mb_xy][12], &h->non_zero_count_cache[4+8* 4]);
1197
+    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[4+8* 6]);
1198
+    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8* 7]);
1199
+    AV_COPY32(&h->non_zero_count[mb_xy][32], &h->non_zero_count_cache[4+8*11]);
1200
+    AV_COPY32(&h->non_zero_count[mb_xy][36], &h->non_zero_count_cache[4+8*12]);
1201
+
1202
+    if(CHROMA444){
1203
+        AV_COPY32(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[4+8* 8]);
1204
+        AV_COPY32(&h->non_zero_count[mb_xy][28], &h->non_zero_count_cache[4+8* 9]);
1205
+        AV_COPY32(&h->non_zero_count[mb_xy][40], &h->non_zero_count_cache[4+8*13]);
1206
+        AV_COPY32(&h->non_zero_count[mb_xy][44], &h->non_zero_count_cache[4+8*14]);
1207
+    }
1198 1208
 }
1199 1209
 
1200 1210
 static inline void write_back_motion(H264Context *h, int mb_type){
... ...
@@ -1267,8 +1287,7 @@ static void av_unused decode_mb_skip(H264Context *h){
1267 1267
     const int mb_xy= h->mb_xy;
1268 1268
     int mb_type=0;
1269 1269
 
1270
-    memset(h->non_zero_count[mb_xy], 0, 32);
1271
-    memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
1270
+    memset(h->non_zero_count[mb_xy], 0, 48);
1272 1271
 
1273 1272
     if(MB_FIELD)
1274 1273
         mb_type|= MB_TYPE_INTERLACED;
... ...
@@ -45,7 +45,7 @@
45 45
 
46 46
 /* Cabac pre state table */
47 47
 
48
-static const int8_t cabac_context_init_I[460][2] =
48
+static const int8_t cabac_context_init_I[1024][2] =
49 49
 {
50 50
     /* 0 - 10 */
51 51
     { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
... ...
@@ -211,10 +211,153 @@ static const int8_t cabac_context_init_I[460][2] =
211 211
     { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
212 212
     {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
213 213
     {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
214
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
214
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
215
+
216
+    /* 460 -> 1024 */
217
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
218
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
219
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
220
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
221
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
222
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
223
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
224
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
225
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
226
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
227
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
228
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
229
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
230
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
231
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
232
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
233
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
234
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
235
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
236
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
237
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
238
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
239
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
240
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
241
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
242
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
243
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
244
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
245
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
246
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
247
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
248
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
249
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
250
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
251
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
252
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
253
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
254
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
255
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
256
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
257
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
258
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
259
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
260
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
261
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
262
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
263
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
264
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
265
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
266
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
267
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
268
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
269
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
270
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
271
+    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
272
+    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
273
+    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
274
+    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
275
+    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
276
+    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
277
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
278
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
279
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
280
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
281
+    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
282
+    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
283
+    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
284
+    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
285
+    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
286
+    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
287
+    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
288
+    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
289
+    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
290
+    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
291
+    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
292
+    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
293
+    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
294
+    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
295
+    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
296
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
297
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
298
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
299
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
300
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
301
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
302
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
303
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
304
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
305
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
306
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
307
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
308
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
309
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
310
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
311
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
312
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
313
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
314
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
315
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
316
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
317
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
318
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
319
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
320
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
321
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
322
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
323
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
324
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
325
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
326
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
327
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
328
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
329
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
330
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
331
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
332
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
333
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
334
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
335
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
336
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
337
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
338
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
339
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
340
+    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
341
+    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
342
+    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
343
+    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
344
+    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
345
+    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
346
+    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
347
+    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
348
+    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
349
+    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
350
+    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
351
+    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
352
+    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
353
+    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
354
+    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
355
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
356
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
357
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
215 358
 };
216 359
 
217
-static const int8_t cabac_context_init_PB[3][460][2] =
360
+static const int8_t cabac_context_init_PB[3][1024][2] =
218 361
 {
219 362
     /* i_cabac_init_idc == 0 */
220 363
     {
... ...
@@ -370,6 +513,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
370 370
         { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
371 371
         {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
372 372
         {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
373
+
374
+        /* 460 - 1024 */
375
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
376
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
377
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
378
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
379
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
380
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
381
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
382
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
383
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
384
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
385
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
386
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
387
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
388
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
389
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
390
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
391
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
392
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
393
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
394
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
395
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
396
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
397
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
398
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
399
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
400
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
401
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
402
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
403
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
404
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
405
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
406
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
407
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
408
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
409
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
410
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
411
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
412
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
413
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
414
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
415
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
416
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
417
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
418
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
419
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
420
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
421
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
422
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
423
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
424
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
425
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
426
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
427
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
428
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
429
+        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
430
+        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
431
+        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
432
+        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
433
+        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
434
+        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
435
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
436
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
437
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
438
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
439
+        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
440
+        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
441
+        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
442
+        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
443
+        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
444
+        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
445
+        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
446
+        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
447
+        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
448
+        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
449
+        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
450
+        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
451
+        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
452
+        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
453
+        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
454
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
455
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
456
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
457
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
458
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
459
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
460
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
461
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
462
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
463
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
464
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
465
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
466
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
467
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
468
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
469
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
470
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
471
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
472
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
473
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
474
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
475
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
476
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
477
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
478
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
479
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
480
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
481
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
482
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
483
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
484
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
485
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
486
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
487
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
488
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
489
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
490
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
491
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
492
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
493
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
494
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
495
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
496
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
497
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
498
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
499
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
500
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
501
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
502
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
503
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
504
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
505
+        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
506
+        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
507
+        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
508
+        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
509
+        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
510
+        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
511
+        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
512
+        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
513
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
514
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
515
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
373 516
     },
374 517
 
375 518
     /* i_cabac_init_idc == 1 */
... ...
@@ -526,6 +812,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
526 526
         {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
527 527
         {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
528 528
         {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
529
+
530
+        /* 460 - 1024 */
531
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
532
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
533
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
534
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
535
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
536
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
537
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
538
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
539
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
540
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
541
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
542
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
543
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
544
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
545
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
546
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
547
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
548
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
549
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
550
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
551
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
552
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
553
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
554
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
555
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
556
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
557
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
558
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
559
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
560
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
561
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
562
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
563
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
564
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
565
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
566
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
567
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
568
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
569
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
570
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
571
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
572
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
573
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
574
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
575
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
576
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
577
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
578
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
579
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
580
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
581
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
582
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
583
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
584
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
585
+        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
586
+        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
587
+        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
588
+        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
589
+        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
590
+        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
591
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
592
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
593
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
594
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
595
+        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
596
+        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
597
+        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
598
+        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
599
+        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
600
+        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
601
+        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
602
+        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
603
+        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
604
+        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
605
+        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
606
+        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
607
+        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
608
+        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
609
+        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
610
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
611
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
612
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
613
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
614
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
615
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
616
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
617
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
618
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
619
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
620
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
621
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
622
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
623
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
624
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
625
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
626
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
627
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
628
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
629
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
630
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
631
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
632
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
633
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
634
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
635
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
636
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
637
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
638
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
639
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
640
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
641
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
642
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
643
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
644
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
645
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
646
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
647
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
648
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
649
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
650
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
651
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
652
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
653
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
654
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
655
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
656
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
657
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
658
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
659
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
660
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
661
+        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
662
+        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
663
+        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
664
+        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
665
+        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
666
+        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
667
+        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
668
+        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
669
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
670
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
671
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
529 672
     },
530 673
 
531 674
     /* i_cabac_init_idc == 2 */
... ...
@@ -682,6 +1111,149 @@ static const int8_t cabac_context_init_PB[3][460][2] =
682 682
         { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
683 683
         {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
684 684
         {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
685
+
686
+        /* 460 - 1024 */
687
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
688
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
689
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
690
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
691
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
692
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
693
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
694
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
695
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
696
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
697
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
698
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
699
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
700
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
701
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
702
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
703
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
704
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
705
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
706
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
707
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
708
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
709
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
710
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
711
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
712
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
713
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
714
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
715
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
716
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
717
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
718
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
719
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
720
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
721
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
722
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
723
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
724
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
725
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
726
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
727
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
728
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
729
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
730
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
731
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
732
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
733
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
734
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
735
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
736
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
737
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
738
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
739
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
740
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
741
+        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
742
+        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
743
+        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
744
+        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
745
+        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
746
+        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
747
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
748
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
749
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
750
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
751
+        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
752
+        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
753
+        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
754
+        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
755
+        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
756
+        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
757
+        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
758
+        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
759
+        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
760
+        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
761
+        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
762
+        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
763
+        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
764
+        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
765
+        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
766
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
767
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
768
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
769
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
770
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
771
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
772
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
773
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
774
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
775
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
776
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
777
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
778
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
779
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
780
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
781
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
782
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
783
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
784
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
785
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
786
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
787
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
788
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
789
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
790
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
791
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
792
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
793
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
794
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
795
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
796
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
797
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
798
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
799
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
800
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
801
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
802
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
803
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
804
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
805
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
806
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
807
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
808
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
809
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
810
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
811
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
812
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
813
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
814
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
815
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
816
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
817
+        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
818
+        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
819
+        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
820
+        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
821
+        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
822
+        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
823
+        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
824
+        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
825
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
826
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
827
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
685 828
     }
686 829
 };
687 830
 
... ...
@@ -695,7 +1267,7 @@ void ff_h264_init_cabac_states(H264Context *h) {
695 695
     else                                 tab = cabac_context_init_PB[h->cabac_init_idc];
696 696
 
697 697
     /* calculate pre-state */
698
-    for( i= 0; i < 460; i++ ) {
698
+    for( i= 0; i < 1024; i++ ) {
699 699
         int pre = 2*(((tab[i][0] * slice_qp) >>4 ) + tab[i][1]) - 127;
700 700
 
701 701
         pre^= pre>>31;
... ...
@@ -957,21 +1529,22 @@ static int decode_cabac_mb_mvd( H264Context *h, int ctxbase, int amvd, int *mvda
957 957
     my += decode_cabac_mb_mvd( h, 47, amvd1, &mpy );\
958 958
 }
959 959
 
960
-static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
960
+static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int max_coeff, int is_dc ) {
961 961
     int nza, nzb;
962 962
     int ctx = 0;
963
+    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};
963 964
 
964 965
     if( is_dc ) {
965
-        if( cat == 0 ) {
966
-            nza = h->left_cbp&0x100;
967
-            nzb = h-> top_cbp&0x100;
968
-        } else {
966
+        if( cat == 3 ) {
969 967
             idx -= CHROMA_DC_BLOCK_INDEX;
970 968
             nza = (h->left_cbp>>(6+idx))&0x01;
971 969
             nzb = (h-> top_cbp>>(6+idx))&0x01;
970
+        } else {
971
+            idx -= LUMA_DC_BLOCK_INDEX;
972
+            nza = h->left_cbp&(0x100<<idx);
973
+            nzb = h-> top_cbp&(0x100<<idx);
972 974
         }
973 975
     } else {
974
-        assert(cat == 1 || cat == 2 || cat == 4);
975 976
         nza = h->non_zero_count_cache[scan8[idx] - 1];
976 977
         nzb = h->non_zero_count_cache[scan8[idx] - 8];
977 978
     }
... ...
@@ -982,7 +1555,7 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx,
982 982
     if( nzb > 0 )
983 983
         ctx += 2;
984 984
 
985
-    return ctx + 4 * cat;
985
+    return base_ctx[cat] + ctx;
986 986
 }
987 987
 
988 988
 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
... ...
@@ -993,16 +1566,16 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
993 993
 };
994 994
 
995 995
 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
996
-    static const int significant_coeff_flag_offset[2][6] = {
997
-      { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
998
-      { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
996
+    static const int significant_coeff_flag_offset[2][14] = {
997
+      { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
998
+      { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
999 999
     };
1000
-    static const int last_coeff_flag_offset[2][6] = {
1001
-      { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
1002
-      { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
1000
+    static const int last_coeff_flag_offset[2][14] = {
1001
+      { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 },
1002
+      { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 }
1003 1003
     };
1004
-    static const int coeff_abs_level_m1_offset[6] = {
1005
-        227+0, 227+10, 227+20, 227+30, 227+39, 426
1004
+    static const int coeff_abs_level_m1_offset[14] = {
1005
+        227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
1006 1006
     };
1007 1007
     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
1008 1008
       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
... ...
@@ -1057,7 +1630,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
1057 1057
     abs_level_m1_ctx_base = h->cabac_state
1058 1058
         + coeff_abs_level_m1_offset[cat];
1059 1059
 
1060
-    if( !is_dc && cat == 5 ) {
1060
+    if( !is_dc && max_coeff == 64 ) {
1061 1061
 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
1062 1062
         for(last= 0; last < coefs; last++) { \
1063 1063
             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
... ...
@@ -1075,9 +1648,11 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
1075 1075
         }
1076 1076
         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
1077 1077
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
1078
-        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
1078
+        coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
1079
+                                                 last_coeff_ctx_base-significant_coeff_ctx_base, sig_off);
1079 1080
     } else {
1080
-        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
1081
+        coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
1082
+                                             last_coeff_ctx_base-significant_coeff_ctx_base);
1081 1083
 #else
1082 1084
         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
1083 1085
     } else {
... ...
@@ -1087,16 +1662,16 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
1087 1087
     assert(coeff_count > 0);
1088 1088
 
1089 1089
     if( is_dc ) {
1090
-        if( cat == 0 )
1091
-            h->cbp_table[h->mb_xy] |= 0x100;
1092
-        else
1090
+        if( cat == 3 )
1093 1091
             h->cbp_table[h->mb_xy] |= 0x40 << (n - CHROMA_DC_BLOCK_INDEX);
1092
+        else
1093
+            h->cbp_table[h->mb_xy] |= 0x100 << (n - LUMA_DC_BLOCK_INDEX);
1094 1094
         h->non_zero_count_cache[scan8[n]] = coeff_count;
1095 1095
     } else {
1096
-        if( cat == 5 )
1096
+        if( max_coeff == 64 )
1097 1097
             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
1098 1098
         else {
1099
-            assert( cat == 1 || cat == 2 || cat == 4 );
1099
+            assert( cat == 1 || cat ==  2 || cat ==  4 || cat == 7 || cat == 8 || cat == 11 || cat == 12 );
1100 1100
             h->non_zero_count_cache[scan8[n]] = coeff_count;
1101 1101
         }
1102 1102
     }
... ...
@@ -1180,7 +1755,7 @@ static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block
1180 1180
 
1181 1181
 static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
1182 1182
     /* read coded block flag */
1183
-    if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 1 ) ] ) == 0 ) {
1183
+    if( get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 1 ) ] ) == 0 ) {
1184 1184
         h->non_zero_count_cache[scan8[n]] = 0;
1185 1185
         return;
1186 1186
     }
... ...
@@ -1189,13 +1764,68 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
1189 1189
 
1190 1190
 static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
1191 1191
     /* read coded block flag */
1192
-    if( cat != 5 && get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, 0 ) ] ) == 0 ) {
1193
-        h->non_zero_count_cache[scan8[n]] = 0;
1192
+    if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
1193
+        if( max_coeff == 64 ) {
1194
+            fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 0, 1);
1195
+        } else {
1196
+            h->non_zero_count_cache[scan8[n]] = 0;
1197
+        }
1194 1198
         return;
1195 1199
     }
1196 1200
     decode_cabac_residual_nondc_internal( h, block, cat, n, scantable, qmul, max_coeff );
1197 1201
 }
1198 1202
 
1203
+static av_always_inline void decode_cabac_luma_residual( H264Context *h, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p )
1204
+{
1205
+    static const uint8_t ctx_cat[4][3] = {{0,6,10},{1,7,11},{2,8,12},{5,9,13}};
1206
+    const uint32_t *qmul;
1207
+    int i8x8, i4x4;
1208
+    MpegEncContext * const s = &h->s;
1209
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
1210
+    if( IS_INTRA16x16( mb_type ) ) {
1211
+        //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
1212
+        AV_ZERO128(h->mb_luma_dc[p]+0);
1213
+        AV_ZERO128(h->mb_luma_dc[p]+8);
1214
+        AV_ZERO128(h->mb_luma_dc[p]+16);
1215
+        AV_ZERO128(h->mb_luma_dc[p]+24);
1216
+        decode_cabac_residual_dc(h, h->mb_luma_dc[p], ctx_cat[0][p], LUMA_DC_BLOCK_INDEX+p, scan, 16);
1217
+
1218
+        if( cbp&15 ) {
1219
+            qmul = h->dequant4_coeff[p][qscale];
1220
+            for( i4x4 = 0; i4x4 < 16; i4x4++ ) {
1221
+                const int index = 16*p + i4x4;
1222
+                //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", index );
1223
+                decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[1][p], index, scan + 1, qmul, 15);
1224
+            }
1225
+        } else {
1226
+            fill_rectangle(&h->non_zero_count_cache[scan8[16*p]], 4, 4, 8, 0, 1);
1227
+        }
1228
+    } else {
1229
+        int cqm = (IS_INTRA( mb_type ) ? 0:3) + p;
1230
+        for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
1231
+            if( cbp & (1<<i8x8) ) {
1232
+                if( IS_8x8DCT(mb_type) ) {
1233
+                    const int index = 16*p + 4*i8x8;
1234
+                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[3][p], index,
1235
+                                                scan8x8, h->dequant8_coeff[cqm][qscale], 64);
1236
+                } else {
1237
+                    qmul = h->dequant4_coeff[cqm][qscale];
1238
+                    for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
1239
+                        const int index = 16*p + 4*i8x8 + i4x4;
1240
+                        //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
1241
+//START_TIMER
1242
+                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), ctx_cat[2][p], index, scan, qmul, 16);
1243
+//STOP_TIMER("decode_residual")
1244
+                    }
1245
+                }
1246
+            } else {
1247
+                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+16*p] ];
1248
+                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
1249
+            }
1250
+        }
1251
+    }
1252
+}
1253
+
1199 1254
 /**
1200 1255
  * decodes a macroblock
1201 1256
  * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
... ...
@@ -1205,6 +1835,7 @@ int ff_h264_decode_mb_cabac(H264Context *h) {
1205 1205
     int mb_xy;
1206 1206
     int mb_type, partition_count, cbp = 0;
1207 1207
     int dct8x8_allowed= h->pps.transform_8x8_mode;
1208
+    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
1208 1209
     const int pixel_shift = h->pixel_shift;
1209 1210
 
1210 1211
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
... ...
@@ -1314,7 +1945,8 @@ decode_intra_mb:
1314 1314
     h->slice_table[ mb_xy ]= h->slice_num;
1315 1315
 
1316 1316
     if(IS_INTRA_PCM(mb_type)) {
1317
-        const int mb_size = (384*h->sps.bit_depth_luma) >> 3;
1317
+        static const uint16_t mb_sizes[4] = {256,384,512,768};
1318
+        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
1318 1319
         const uint8_t *ptr;
1319 1320
 
1320 1321
         // We assume these blocks are very rare so we do not optimize it.
... ...
@@ -1327,20 +1959,17 @@ decode_intra_mb:
1327 1327
         }
1328 1328
 
1329 1329
         // The pixels are stored in the same order as levels in h->mb array.
1330
-        memcpy(h->mb, ptr, 2*mb_size/3); ptr+=2*mb_size/3;
1331
-        if(CHROMA){
1332
-            memcpy(h->mb+mb_size/3, ptr, mb_size/3); ptr+=mb_size/3;
1333
-        }
1330
+        memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
1334 1331
 
1335 1332
         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
1336 1333
 
1337 1334
         // All blocks are present
1338
-        h->cbp_table[mb_xy] = 0x1ef;
1335
+        h->cbp_table[mb_xy] = 0xf7ef;
1339 1336
         h->chroma_pred_mode_table[mb_xy] = 0;
1340 1337
         // In deblocking, the quantizer is 0
1341 1338
         s->current_picture.qscale_table[mb_xy]= 0;
1342 1339
         // All coeffs are present
1343
-        memset(h->non_zero_count[mb_xy], 16, 32);
1340
+        memset(h->non_zero_count[mb_xy], 16, 48);
1344 1341
         s->current_picture.mb_type[mb_xy]= mb_type;
1345 1342
         h->last_qscale_diff = 0;
1346 1343
         return 0;
... ...
@@ -1377,7 +2006,7 @@ decode_intra_mb:
1377 1377
             h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode( h, h->intra16x16_pred_mode );
1378 1378
             if( h->intra16x16_pred_mode < 0 ) return -1;
1379 1379
         }
1380
-        if(CHROMA){
1380
+        if(decode_chroma){
1381 1381
             h->chroma_pred_mode_table[mb_xy] =
1382 1382
             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
1383 1383
 
... ...
@@ -1606,7 +2235,7 @@ decode_intra_mb:
1606 1606
 
1607 1607
     if( !IS_INTRA16x16( mb_type ) ) {
1608 1608
         cbp  = decode_cabac_mb_cbp_luma( h );
1609
-        if(CHROMA)
1609
+        if(decode_chroma)
1610 1610
             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
1611 1611
     }
1612 1612
 
... ...
@@ -1615,6 +2244,28 @@ decode_intra_mb:
1615 1615
     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
1616 1616
         mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
1617 1617
     }
1618
+
1619
+    /* It would be better to do this in fill_decode_caches, but we don't know
1620
+     * the transform mode of the current macroblock there. */
1621
+    if (CHROMA444 && IS_8x8DCT(mb_type)){
1622
+        int i;
1623
+        for (i = 0; i < 2; i++){
1624
+            if (h->left_type[i] && !IS_8x8DCT(h->left_type[i])){
1625
+                h->non_zero_count_cache[3+8* 1 + 2*8*i]=
1626
+                h->non_zero_count_cache[3+8* 2 + 2*8*i]=
1627
+                h->non_zero_count_cache[3+8* 6 + 2*8*i]=
1628
+                h->non_zero_count_cache[3+8* 7 + 2*8*i]=
1629
+                h->non_zero_count_cache[3+8*11 + 2*8*i]=
1630
+                h->non_zero_count_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0;
1631
+            }
1632
+        }
1633
+        if (h->top_type && !IS_8x8DCT(h->top_type)){
1634
+            uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
1635
+            AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
1636
+            AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
1637
+            AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
1638
+        }
1639
+    }
1618 1640
     s->current_picture.mb_type[mb_xy]= mb_type;
1619 1641
 
1620 1642
     if( cbp || IS_INTRA16x16( mb_type ) ) {
... ...
@@ -1659,76 +2310,38 @@ decode_intra_mb:
1659 1659
         }else
1660 1660
             h->last_qscale_diff=0;
1661 1661
 
1662
-        if( IS_INTRA16x16( mb_type ) ) {
1663
-            int i;
1664
-            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
1665
-            AV_ZERO128(h->mb_luma_dc+0);
1666
-            AV_ZERO128(h->mb_luma_dc+8);
1667
-            AV_ZERO128(h->mb_luma_dc+16);
1668
-            AV_ZERO128(h->mb_luma_dc+24);
1669
-            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, LUMA_DC_BLOCK_INDEX, scan, 16);
1670
-
1671
-            if( cbp&15 ) {
1672
-                qmul = h->dequant4_coeff[0][s->qscale];
1673
-                for( i = 0; i < 16; i++ ) {
1674
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
1675
-                    decode_cabac_residual_nondc(h, h->mb + (16*i << pixel_shift), 1, i, scan + 1, qmul, 15);
1676
-                }
1677
-            } else {
1678
-                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
1679
-            }
1662
+        decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 0);
1663
+        if(CHROMA444){
1664
+            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
1665
+            decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
1680 1666
         } else {
1681
-            int i8x8, i4x4;
1682
-            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
1683
-                if( cbp & (1<<i8x8) ) {
1684
-                    if( IS_8x8DCT(mb_type) ) {
1685
-                        decode_cabac_residual_nondc(h, h->mb + (64*i8x8 << pixel_shift), 5, 4*i8x8,
1686
-                            scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
1687
-                    } else {
1688
-                        qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
1689
-                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
1690
-                            const int index = 4*i8x8 + i4x4;
1691
-                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
1692
-//START_TIMER
1693
-                            decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 2, index, scan, qmul, 16);
1694
-//STOP_TIMER("decode_residual")
1695
-                        }
1696
-                    }
1697
-                } else {
1698
-                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
1699
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
1667
+            if( cbp&0x30 ){
1668
+                int c;
1669
+                for( c = 0; c < 2; c++ ) {
1670
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
1671
+                    decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
1700 1672
                 }
1701 1673
             }
1702
-        }
1703 1674
 
1704
-        if( cbp&0x30 ){
1705
-            int c;
1706
-            for( c = 0; c < 2; c++ ) {
1707
-                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
1708
-                decode_cabac_residual_dc(h, h->mb + ((256 + 16*4*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
1709
-            }
1710
-        }
1711
-
1712
-        if( cbp&0x20 ) {
1713
-            int c, i;
1714
-            for( c = 0; c < 2; c++ ) {
1715
-                qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
1716
-                for( i = 0; i < 4; i++ ) {
1717
-                    const int index = 16 + 4 * c + i;
1718
-                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
1719
-                    decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
1675
+            if( cbp&0x20 ) {
1676
+                int c, i;
1677
+                for( c = 0; c < 2; c++ ) {
1678
+                    qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
1679
+                    for( i = 0; i < 4; i++ ) {
1680
+                        const int index = 16 + 16 * c + i;
1681
+                        //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
1682
+                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
1683
+                    }
1720 1684
                 }
1685
+            } else {
1686
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
1687
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
1721 1688
             }
1722
-        } else {
1723
-            uint8_t * const nnz= &h->non_zero_count_cache[0];
1724
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
1725
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
1726 1689
         }
1727 1690
     } else {
1728
-        uint8_t * const nnz= &h->non_zero_count_cache[0];
1729
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
1730
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
1731
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
1691
+        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
1692
+        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
1693
+        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
1732 1694
         h->last_qscale_diff = 0;
1733 1695
     }
1734 1696
 
... ...
@@ -371,12 +371,12 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
371 371
 
372 372
     //FIXME put trailing_onex into the context
373 373
 
374
-    if(n >= CHROMA_DC_BLOCK_INDEX){
374
+    if(max_coeff <= 8){
375 375
         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
376 376
         total_coeff= coeff_token>>2;
377 377
     }else{
378
-        if(n == LUMA_DC_BLOCK_INDEX){
379
-            total_coeff= pred_non_zero_count(h, 0);
378
+        if(n >= LUMA_DC_BLOCK_INDEX){
379
+            total_coeff= pred_non_zero_count(h, (n - LUMA_DC_BLOCK_INDEX)*16);
380 380
             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
381 381
             total_coeff= coeff_token>>2;
382 382
         }else{
... ...
@@ -482,7 +482,8 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
482 482
     if(total_coeff == max_coeff)
483 483
         zeros_left=0;
484 484
     else{
485
-        if(n >= CHROMA_DC_BLOCK_INDEX)
485
+        /* FIXME: we don't actually support 4:2:2 yet. */
486
+        if(max_coeff <= 8)
486 487
             zeros_left= get_vlc2(gb, (chroma_dc_total_zeros_vlc-1)[ total_coeff ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
487 488
         else
488 489
             zeros_left= get_vlc2(gb, (total_zeros_vlc-1)[ total_coeff ].table, TOTAL_ZEROS_VLC_BITS, 1);
... ...
@@ -536,12 +537,80 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
536 536
     return 0;
537 537
 }
538 538
 
539
+static av_always_inline int decode_luma_residual(H264Context *h, GetBitContext *gb, const uint8_t *scan, const uint8_t *scan8x8, int pixel_shift, int mb_type, int cbp, int p){
540
+    int i4x4, i8x8;
541
+    MpegEncContext * const s = &h->s;
542
+    int qscale = p == 0 ? s->qscale : h->chroma_qp[p-1];
543
+    if(IS_INTRA16x16(mb_type)){
544
+        AV_ZERO128(h->mb_luma_dc[p]+0);
545
+        AV_ZERO128(h->mb_luma_dc[p]+8);
546
+        AV_ZERO128(h->mb_luma_dc[p]+16);
547
+        AV_ZERO128(h->mb_luma_dc[p]+24);
548
+        if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc[p], LUMA_DC_BLOCK_INDEX+p, scan, NULL, 16) < 0){
549
+            return -1; //FIXME continue if partitioned and other return -1 too
550
+        }
551
+
552
+        assert((cbp&15) == 0 || (cbp&15) == 15);
553
+
554
+        if(cbp&15){
555
+            for(i8x8=0; i8x8<4; i8x8++){
556
+                for(i4x4=0; i4x4<4; i4x4++){
557
+                    const int index= i4x4 + 4*i8x8 + p*16;
558
+                    if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift),
559
+                        index, scan + 1, h->dequant4_coeff[p][qscale], 15) < 0 ){
560
+                        return -1;
561
+                    }
562
+                }
563
+            }
564
+            return 0xf;
565
+        }else{
566
+            fill_rectangle(&h->non_zero_count_cache[scan8[p*16]], 4, 4, 8, 0, 1);
567
+            return 0;
568
+        }
569
+    }else{
570
+        int cqm = (IS_INTRA( mb_type ) ? 0:3)+p;
571
+        /* For CAVLC 4:4:4, we need to keep track of the luma 8x8 CBP for deblocking nnz purposes. */
572
+        int new_cbp = 0;
573
+        for(i8x8=0; i8x8<4; i8x8++){
574
+            if(cbp & (1<<i8x8)){
575
+                if(IS_8x8DCT(mb_type)){
576
+                    DCTELEM *buf = &h->mb[64*i8x8+256*p << pixel_shift];
577
+                    uint8_t *nnz;
578
+                    for(i4x4=0; i4x4<4; i4x4++){
579
+                        const int index= i4x4 + 4*i8x8 + p*16;
580
+                        if( decode_residual(h, gb, buf, index, scan8x8+16*i4x4,
581
+                                            h->dequant8_coeff[cqm][qscale], 16) < 0 )
582
+                            return -1;
583
+                    }
584
+                    nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
585
+                    nnz[0] += nnz[1] + nnz[8] + nnz[9];
586
+                    new_cbp |= !!nnz[0] << i8x8;
587
+                }else{
588
+                    for(i4x4=0; i4x4<4; i4x4++){
589
+                        const int index= i4x4 + 4*i8x8 + p*16;
590
+                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index,
591
+                                            scan, h->dequant4_coeff[cqm][qscale], 16) < 0 ){
592
+                            return -1;
593
+                        }
594
+                        new_cbp |= h->non_zero_count_cache[ scan8[index] ] << i8x8;
595
+                    }
596
+                }
597
+            }else{
598
+                uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8+p*16] ];
599
+                nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
600
+            }
601
+        }
602
+        return new_cbp;
603
+    }
604
+}
605
+
539 606
 int ff_h264_decode_mb_cavlc(H264Context *h){
540 607
     MpegEncContext * const s = &h->s;
541 608
     int mb_xy;
542 609
     int partition_count;
543 610
     unsigned int mb_type, cbp;
544 611
     int dct8x8_allowed= h->pps.transform_8x8_mode;
612
+    int decode_chroma = h->sps.chroma_format_idc == 1 || h->sps.chroma_format_idc == 2;
545 613
     const int pixel_shift = h->pixel_shift;
546 614
 
547 615
     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
... ...
@@ -608,19 +677,21 @@ decode_intra_mb:
608 608
 
609 609
     if(IS_INTRA_PCM(mb_type)){
610 610
         unsigned int x;
611
+        static const uint16_t mb_sizes[4] = {256,384,512,768};
612
+        const int mb_size = mb_sizes[h->sps.chroma_format_idc]*h->sps.bit_depth_luma >> 3;
611 613
 
612 614
         // We assume these blocks are very rare so we do not optimize it.
613 615
         align_get_bits(&s->gb);
614 616
 
615 617
         // The pixels are stored in the same order as levels in h->mb array.
616
-        for(x=0; x < (CHROMA ? 384 : 256)*h->sps.bit_depth_luma/8; x++){
618
+        for(x=0; x < mb_size; x++){
617 619
             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
618 620
         }
619 621
 
620 622
         // In deblocking, the quantizer is 0
621 623
         s->current_picture.qscale_table[mb_xy]= 0;
622 624
         // All coeffs are present
623
-        memset(h->non_zero_count[mb_xy], 16, 32);
625
+        memset(h->non_zero_count[mb_xy], 16, 48);
624 626
 
625 627
         s->current_picture.mb_type[mb_xy]= mb_type;
626 628
         return 0;
... ...
@@ -668,7 +739,7 @@ decode_intra_mb:
668 668
             if(h->intra16x16_pred_mode < 0)
669 669
                 return -1;
670 670
         }
671
-        if(CHROMA){
671
+        if(decode_chroma){
672 672
             pred_mode= ff_h264_check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
673 673
             if(pred_mode < 0)
674 674
                 return -1;
... ...
@@ -896,15 +967,19 @@ decode_intra_mb:
896 896
 
897 897
     if(!IS_INTRA16x16(mb_type)){
898 898
         cbp= get_ue_golomb(&s->gb);
899
-        if(cbp > 47){
900
-            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
901
-            return -1;
902
-        }
903 899
 
904
-        if(CHROMA){
900
+        if(decode_chroma){
901
+            if(cbp > 47){
902
+                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
903
+                return -1;
904
+            }
905 905
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
906 906
             else                     cbp= golomb_to_inter_cbp   [cbp];
907 907
         }else{
908
+            if(cbp > 15){
909
+                av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
910
+                return -1;
911
+            }
908 912
             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
909 913
             else                     cbp= golomb_to_inter_cbp_gray[cbp];
910 914
         }
... ...
@@ -918,8 +993,9 @@ decode_intra_mb:
918 918
     s->current_picture.mb_type[mb_xy]= mb_type;
919 919
 
920 920
     if(cbp || IS_INTRA16x16(mb_type)){
921
-        int i8x8, i4x4, chroma_idx;
921
+        int i4x4, chroma_idx;
922 922
         int dquant;
923
+        int ret;
923 924
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
924 925
         const uint8_t *scan, *scan8x8;
925 926
         const int max_qp = 51 + 6*(h->sps.bit_depth_luma-8);
... ...
@@ -947,85 +1023,45 @@ decode_intra_mb:
947 947
 
948 948
         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
949 949
         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
950
-        if(IS_INTRA16x16(mb_type)){
951
-            AV_ZERO128(h->mb_luma_dc+0);
952
-            AV_ZERO128(h->mb_luma_dc+8);
953
-            AV_ZERO128(h->mb_luma_dc+16);
954
-            AV_ZERO128(h->mb_luma_dc+24);
955
-            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
956
-                return -1; //FIXME continue if partitioned and other return -1 too
957
-            }
958 950
 
959
-            assert((cbp&15) == 0 || (cbp&15) == 15);
951
+        if( (ret = decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 0)) < 0 ){
952
+            return -1;
953
+        }
954
+        h->cbp_table[mb_xy] |= ret << 12;
955
+        if(CHROMA444){
956
+            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 1) < 0 ){
957
+                return -1;
958
+            }
959
+            if( decode_luma_residual(h, gb, scan, scan8x8, pixel_shift, mb_type, cbp, 2) < 0 ){
960
+                return -1;
961
+            }
962
+        } else {
963
+            if(cbp&0x30){
964
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++)
965
+                    if( decode_residual(h, gb, h->mb + ((256 + 16*16*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
966
+                        return -1;
967
+                    }
968
+            }
960 969
 
961
-            if(cbp&15){
962
-                for(i8x8=0; i8x8<4; i8x8++){
970
+            if(cbp&0x20){
971
+                for(chroma_idx=0; chroma_idx<2; chroma_idx++){
972
+                    const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
963 973
                     for(i4x4=0; i4x4<4; i4x4++){
964
-                        const int index= i4x4 + 4*i8x8;
965
-                        if( decode_residual(h, h->intra_gb_ptr, h->mb + (16*index << pixel_shift), index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
974
+                        const int index= 16 + 16*chroma_idx + i4x4;
975
+                        if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
966 976
                             return -1;
967 977
                         }
968 978
                     }
969 979
                 }
970 980
             }else{
971
-                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
981
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
982
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
972 983
             }
973
-        }else{
974
-            for(i8x8=0; i8x8<4; i8x8++){
975
-                if(cbp & (1<<i8x8)){
976
-                    if(IS_8x8DCT(mb_type)){
977
-                        DCTELEM *buf = &h->mb[64*i8x8 << pixel_shift];
978
-                        uint8_t *nnz;
979
-                        for(i4x4=0; i4x4<4; i4x4++){
980
-                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
981
-                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
982
-                                return -1;
983
-                        }
984
-                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
985
-                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
986
-                    }else{
987
-                        for(i4x4=0; i4x4<4; i4x4++){
988
-                            const int index= i4x4 + 4*i8x8;
989
-
990
-                            if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
991
-                                return -1;
992
-                            }
993
-                        }
994
-                    }
995
-                }else{
996
-                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
997
-                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
998
-                }
999
-            }
1000
-        }
1001
-
1002
-        if(cbp&0x30){
1003
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
1004
-                if( decode_residual(h, gb, h->mb + ((256 + 16*4*chroma_idx) << pixel_shift), CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){
1005
-                    return -1;
1006
-                }
1007
-        }
1008
-
1009
-        if(cbp&0x20){
1010
-            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
1011
-                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
1012
-                for(i4x4=0; i4x4<4; i4x4++){
1013
-                    const int index= 16 + 4*chroma_idx + i4x4;
1014
-                    if( decode_residual(h, gb, h->mb + (16*index << pixel_shift), index, scan + 1, qmul, 15) < 0){
1015
-                        return -1;
1016
-                    }
1017
-                }
1018
-            }
1019
-        }else{
1020
-            uint8_t * const nnz= &h->non_zero_count_cache[0];
1021
-            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
1022
-            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
1023 984
         }
1024 985
     }else{
1025
-        uint8_t * const nnz= &h->non_zero_count_cache[0];
1026
-        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
1027
-        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
1028
-        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
986
+        fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
987
+        fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
988
+        fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
1029 989
     }
1030 990
     s->current_picture.qscale_table[mb_xy]= s->qscale;
1031 991
     write_back_non_zero_count(h);
... ...
@@ -217,10 +217,11 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
217 217
     int mb_xy;
218 218
     int mb_type, left_type;
219 219
     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
220
+    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
220 221
 
221 222
     mb_xy = h->mb_xy;
222 223
 
223
-    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff) {
224
+    if(!h->top_type || !h->h264dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff || CHROMA444) {
224 225
         ff_h264_filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
225 226
         return;
226 227
     }
... ...
@@ -262,16 +263,18 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
262 262
             filter_mb_edgeh( &img_y[4*2*linesize], linesize, bS3, qp, h);
263 263
             filter_mb_edgeh( &img_y[4*3*linesize], linesize, bS3, qp, h);
264 264
         }
265
-        if(left_type){
266
-            filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
267
-            filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
265
+        if(chroma){
266
+            if(left_type){
267
+                filter_mb_edgecv( &img_cb[2*0], uvlinesize, bS4, qpc0, h);
268
+                filter_mb_edgecv( &img_cr[2*0], uvlinesize, bS4, qpc0, h);
269
+            }
270
+            filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
271
+            filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
272
+            filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
273
+            filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
274
+            filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
275
+            filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
268 276
         }
269
-        filter_mb_edgecv( &img_cb[2*2], uvlinesize, bS3, qpc, h);
270
-        filter_mb_edgecv( &img_cr[2*2], uvlinesize, bS3, qpc, h);
271
-        filter_mb_edgech( &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
272
-        filter_mb_edgech( &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
273
-        filter_mb_edgech( &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1, h);
274
-        filter_mb_edgech( &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc, h);
275 277
         return;
276 278
     } else {
277 279
         LOCAL_ALIGNED_8(int16_t, bS, [2], [4][4]);
... ...
@@ -298,7 +301,7 @@ void ff_h264_filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
298 298
 #define FILTER(hv,dir,edge)\
299 299
         if(AV_RN64A(bS[dir][edge])) {                                   \
300 300
             filter_mb_edge##hv( &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir, h );\
301
-            if(!(edge&1)) {\
301
+            if(chroma && !(edge&1)) {\
302 302
                 filter_mb_edgec##hv( &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
303 303
                 filter_mb_edgec##hv( &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir, h );\
304 304
             }\
... ...
@@ -353,9 +356,10 @@ static int check_mv(H264Context *h, long b_idx, long bn_idx, int mvy_limit){
353 353
     return v;
354 354
 }
355 355
 
356
-static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
356
+static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int chroma, int chroma444, int dir) {
357 357
     MpegEncContext * const s = &h->s;
358 358
     int edge;
359
+    int chroma_qp_avg[2];
359 360
     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
360 361
     const int mbm_type = dir == 0 ? h->left_type[0] : h->top_type;
361 362
 
... ...
@@ -394,7 +398,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
394 394
                         bS[2]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+2]);
395 395
                         bS[3]= 1+((h->cbp_table[mbn_xy] & 8)||h->non_zero_count_cache[scan8[0]+3]);
396 396
                     }else{
397
-                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 4+3*8;
397
+                    const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy] + 3*4;
398 398
                     int i;
399 399
                     for( i = 0; i < 4; i++ ) {
400 400
                         bS[i] = 1 + !!(h->non_zero_count_cache[scan8[0]+i] | mbn_nnz[i]);
... ...
@@ -407,10 +411,17 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
407 407
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
408 408
                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
409 409
                 filter_mb_edgeh( &img_y[j*linesize], tmp_linesize, bS, qp, h );
410
-                filter_mb_edgech( &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
411
-                                ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
412
-                filter_mb_edgech( &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
413
-                                ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1, h);
410
+                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
411
+                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
412
+                if (chroma) {
413
+                    if (chroma444) {
414
+                        filter_mb_edgeh (&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
415
+                        filter_mb_edgeh (&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
416
+                    } else {
417
+                        filter_mb_edgech(&img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[0], h);
418
+                        filter_mb_edgech(&img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp_avg[1], h);
419
+                    }
420
+                }
414 421
             }
415 422
         }else{
416 423
             DECLARE_ALIGNED(8, int16_t, bS)[4];
... ...
@@ -465,23 +476,29 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
465 465
                 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
466 466
                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
467 467
                 //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
468
+                chroma_qp_avg[0] = (h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
469
+                chroma_qp_avg[1] = (h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
468 470
                 if( dir == 0 ) {
469 471
                     filter_mb_edgev( &img_y[0], linesize, bS, qp, h );
470
-                    {
471
-                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
472
-                        filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, h);
473
-                        if(h->pps.chroma_qp_diff)
474
-                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
475
-                        filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, h);
472
+                    if (chroma) {
473
+                        if (chroma444) {
474
+                            filter_mb_edgev ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
475
+                            filter_mb_edgev ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
476
+                        } else {
477
+                            filter_mb_edgecv( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
478
+                            filter_mb_edgecv( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
479
+                        }
476 480
                     }
477 481
                 } else {
478 482
                     filter_mb_edgeh( &img_y[0], linesize, bS, qp, h );
479
-                    {
480
-                        int qp= ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
481
-                        filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, h);
482
-                        if(h->pps.chroma_qp_diff)
483
-                            qp= ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbm_xy] ) + 1 ) >> 1;
484
-                        filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, h);
483
+                    if (chroma) {
484
+                        if (chroma444) {
485
+                            filter_mb_edgeh ( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
486
+                            filter_mb_edgeh ( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
487
+                        } else {
488
+                            filter_mb_edgech( &img_cb[0], uvlinesize, bS, chroma_qp_avg[0], h);
489
+                            filter_mb_edgech( &img_cr[0], uvlinesize, bS, chroma_qp_avg[1], h);
490
+                        }
485 491
                     }
486 492
                 }
487 493
             }
... ...
@@ -545,15 +562,25 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
545 545
         //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
546 546
         if( dir == 0 ) {
547 547
             filter_mb_edgev( &img_y[4*edge << h->pixel_shift], linesize, bS, qp, h );
548
-            if( (edge&1) == 0 ) {
549
-                filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
550
-                filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
548
+            if (chroma) {
549
+                if (chroma444) {
550
+                    filter_mb_edgev ( &img_cb[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
551
+                    filter_mb_edgev ( &img_cr[4*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
552
+                } else if( (edge&1) == 0 ) {
553
+                    filter_mb_edgecv( &img_cb[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[0], h);
554
+                    filter_mb_edgecv( &img_cr[2*edge << h->pixel_shift], uvlinesize, bS, h->chroma_qp[1], h);
555
+                }
551 556
             }
552 557
         } else {
553 558
             filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, h );
554
-            if( (edge&1) == 0 ) {
555
-                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
556
-                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
559
+            if (chroma) {
560
+                if (chroma444) {
561
+                    filter_mb_edgeh ( &img_cb[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
562
+                    filter_mb_edgeh ( &img_cr[4*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
563
+                } else if( (edge&1) == 0 ) {
564
+                    filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[0], h);
565
+                    filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, h->chroma_qp[1], h);
566
+                }
557 567
             }
558 568
         }
559 569
     }
... ...
@@ -566,6 +593,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
566 566
     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
567 567
     int first_vertical_edge_done = 0;
568 568
     av_unused int dir;
569
+    int chroma = !(CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
569 570
 
570 571
     if (FRAME_MBAFF
571 572
             // and current and left pair do not have the same interlaced type
... ...
@@ -589,11 +617,11 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
589 589
         } else {
590 590
             static const uint8_t offset[2][2][8]={
591 591
                 {
592
-                    {7+8*0, 7+8*0, 7+8*0, 7+8*0, 7+8*1, 7+8*1, 7+8*1, 7+8*1},
593
-                    {7+8*2, 7+8*2, 7+8*2, 7+8*2, 7+8*3, 7+8*3, 7+8*3, 7+8*3},
592
+                    {3+4*0, 3+4*0, 3+4*0, 3+4*0, 3+4*1, 3+4*1, 3+4*1, 3+4*1},
593
+                    {3+4*2, 3+4*2, 3+4*2, 3+4*2, 3+4*3, 3+4*3, 3+4*3, 3+4*3},
594 594
                 },{
595
-                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
596
-                    {7+8*0, 7+8*1, 7+8*2, 7+8*3, 7+8*0, 7+8*1, 7+8*2, 7+8*3},
595
+                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
596
+                    {3+4*0, 3+4*1, 3+4*2, 3+4*3, 3+4*0, 3+4*1, 3+4*2, 3+4*3},
597 597
                 }
598 598
             };
599 599
             const uint8_t *off= offset[MB_FIELD][mb_y&1];
... ...
@@ -634,25 +662,29 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
634 634
         if(MB_FIELD){
635 635
             filter_mb_mbaff_edgev ( h, img_y                ,   linesize, bS  , 1, qp [0] );
636 636
             filter_mb_mbaff_edgev ( h, img_y  + 8*  linesize,   linesize, bS+4, 1, qp [1] );
637
-            filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
638
-            filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
639
-            filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
640
-            filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
637
+            if (chroma){
638
+                filter_mb_mbaff_edgecv( h, img_cb,                uvlinesize, bS  , 1, bqp[0] );
639
+                filter_mb_mbaff_edgecv( h, img_cb + 4*uvlinesize, uvlinesize, bS+4, 1, bqp[1] );
640
+                filter_mb_mbaff_edgecv( h, img_cr,                uvlinesize, bS  , 1, rqp[0] );
641
+                filter_mb_mbaff_edgecv( h, img_cr + 4*uvlinesize, uvlinesize, bS+4, 1, rqp[1] );
642
+            }
641 643
         }else{
642 644
             filter_mb_mbaff_edgev ( h, img_y              , 2*  linesize, bS  , 2, qp [0] );
643 645
             filter_mb_mbaff_edgev ( h, img_y  +   linesize, 2*  linesize, bS+1, 2, qp [1] );
644
-            filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
645
-            filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
646
-            filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
647
-            filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
646
+            if (chroma){
647
+                filter_mb_mbaff_edgecv( h, img_cb,              2*uvlinesize, bS  , 2, bqp[0] );
648
+                filter_mb_mbaff_edgecv( h, img_cb + uvlinesize, 2*uvlinesize, bS+1, 2, bqp[1] );
649
+                filter_mb_mbaff_edgecv( h, img_cr,              2*uvlinesize, bS  , 2, rqp[0] );
650
+                filter_mb_mbaff_edgecv( h, img_cr + uvlinesize, 2*uvlinesize, bS+1, 2, rqp[1] );
651
+            }
648 652
         }
649 653
     }
650 654
 
651 655
 #if CONFIG_SMALL
652 656
     for( dir = 0; dir < 2; dir++ )
653
-        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
657
+        filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, chroma, CHROMA444, dir);
654 658
 #else
655
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
656
-    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
659
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, chroma, CHROMA444, 0);
660
+    filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, chroma, CHROMA444, 1);
657 661
 #endif
658 662
 }
... ...
@@ -269,7 +269,7 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
269 269
         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
270 270
         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
271 271
         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
272
-        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
272
+        fallback_sps ? sps->scaling_matrix8[3] : default_scaling8[1]
273 273
     };
274 274
     if(get_bits1(&s->gb)){
275 275
         sps->scaling_matrix_present |= is_sps;
... ...
@@ -281,7 +281,15 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
281 281
         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
282 282
         if(is_sps || pps->transform_8x8_mode){
283 283
             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
284
-            decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
284
+            if(h->sps.chroma_format_idc == 3){
285
+                decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[0],scaling_matrix8[0]);  // Intra, Cr
286
+                decode_scaling_list(h,scaling_matrix8[2],64,default_scaling8[0],scaling_matrix8[1]);  // Intra, Cb
287
+            }
288
+            decode_scaling_list(h,scaling_matrix8[3],64,default_scaling8[1],fallback[3]);  // Inter, Y
289
+            if(h->sps.chroma_format_idc == 3){
290
+                decode_scaling_list(h,scaling_matrix8[4],64,default_scaling8[1],scaling_matrix8[3]);  // Inter, Cr
291
+                decode_scaling_list(h,scaling_matrix8[5],64,default_scaling8[1],scaling_matrix8[4]);  // Inter, Cb
292
+            }
285 293
         }
286 294
     }
287 295
 }
... ...
@@ -395,7 +403,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
395 395
         if(sps->crop_left || sps->crop_top){
396 396
             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
397 397
         }
398
-        if(sps->crop_right >= 8 || sps->crop_bottom >= 8){
398
+        if(sps->crop_right >= (8<<CHROMA444) || sps->crop_bottom >= (8<<CHROMA444)){
399 399
             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
400 400
         }
401 401
     }else{
... ...
@@ -66,10 +66,10 @@ typedef struct H264DSPContext{
66 66
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
67 67
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
68 68
 
69
-    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
70
-    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
71
-    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
72
-    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
69
+    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
70
+    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
71
+    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
72
+    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
73 73
     void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
74 74
     void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul);
75 75
 }H264DSPContext;
... ...
@@ -30,15 +30,19 @@
30 30
 #ifndef AVCODEC_H264IDCT_INTERNAL_H
31 31
 #define AVCODEC_H264IDCT_INTERNAL_H
32 32
 //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
33
-static const uint8_t scan8[16 + 2*4]={
34
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
35
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
36
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
37
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
38
- 1+1*8, 2+1*8,
39
- 1+2*8, 2+2*8,
40
- 1+4*8, 2+4*8,
41
- 1+5*8, 2+5*8,
33
+static const uint8_t scan8[16*3]={
34
+ 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
35
+ 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
36
+ 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
37
+ 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
38
+ 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
39
+ 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
40
+ 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
41
+ 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
42
+ 4+11*8, 5+11*8, 4+12*8, 5+12*8,
43
+ 6+11*8, 7+11*8, 6+12*8, 7+12*8,
44
+ 4+13*8, 5+13*8, 4+14*8, 5+14*8,
45
+ 6+13*8, 7+13*8, 6+14*8, 7+14*8
42 46
 };
43 47
 #endif
44 48
 
... ...
@@ -190,7 +194,7 @@ void FUNCC(ff_h264_idct8_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){
190 190
     }
191 191
 }
192 192
 
193
-void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
193
+void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
194 194
     int i;
195 195
     for(i=0; i<16; i++){
196 196
         int nnz = nnzc[ scan8[i] ];
... ...
@@ -201,7 +205,7 @@ void FUNCC(ff_h264_idct_add16)(uint8_t *dst, const int *block_offset, DCTELEM *b
201 201
     }
202 202
 }
203 203
 
204
-void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
204
+void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
205 205
     int i;
206 206
     for(i=0; i<16; i++){
207 207
         if(nnzc[ scan8[i] ])             FUNCC(idct_internal      )(dst + block_offset[i], block + i*16*sizeof(pixel), stride, 4, 6, 1);
... ...
@@ -209,7 +213,7 @@ void FUNCC(ff_h264_idct_add16intra)(uint8_t *dst, const int *block_offset, DCTEL
209 209
     }
210 210
 }
211 211
 
212
-void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
212
+void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
213 213
     int i;
214 214
     for(i=0; i<16; i+=4){
215 215
         int nnz = nnzc[ scan8[i] ];
... ...
@@ -220,13 +224,15 @@ void FUNCC(ff_h264_idct8_add4)(uint8_t *dst, const int *block_offset, DCTELEM *b
220 220
     }
221 221
 }
222 222
 
223
-void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
224
-    int i;
225
-    for(i=16; i<16+8; i++){
226
-        if(nnzc[ scan8[i] ])
227
-            FUNCC(ff_h264_idct_add   )(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
228
-        else if(((dctcoef*)block)[i*16])
229
-            FUNCC(ff_h264_idct_dc_add)(dest[(i&4)>>2] + block_offset[i], block + i*16*sizeof(pixel), stride);
223
+void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
224
+    int i, j;
225
+    for(j=1; j<3; j++){
226
+        for(i=j*16; i<j*16+4; i++){
227
+            if(nnzc[ scan8[i] ])
228
+                FUNCC(ff_h264_idct_add   )(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
229
+            else if(((dctcoef*)block)[i*16])
230
+                FUNCC(ff_h264_idct_dc_add)(dest[j-1] + block_offset[i], block + i*16*sizeof(pixel), stride);
231
+        }
230 232
     }
231 233
 }
232 234
 /**
... ...
@@ -1180,12 +1180,17 @@ void MPV_frame_end(MpegEncContext *s)
1180 1180
        && s->current_picture.reference
1181 1181
        && !s->intra_only
1182 1182
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
1183
-        int edges = EDGE_BOTTOM | EDGE_TOP, h = s->v_edge_pos;
1184
-
1185
-            s->dsp.draw_edges(s->current_picture.data[0], s->linesize  , s->h_edge_pos   , h   , EDGE_WIDTH  , edges);
1186
-            s->dsp.draw_edges(s->current_picture.data[1], s->uvlinesize, s->h_edge_pos>>1, h>>1, EDGE_WIDTH/2, edges);
1187
-            s->dsp.draw_edges(s->current_picture.data[2], s->uvlinesize, s->h_edge_pos>>1, h>>1, EDGE_WIDTH/2, edges);
1188
-
1183
+            int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
1184
+            int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
1185
+            s->dsp.draw_edges(s->current_picture.data[0], s->linesize  ,
1186
+                              s->h_edge_pos             , s->v_edge_pos,
1187
+                              EDGE_WIDTH        , EDGE_WIDTH        , EDGE_TOP | EDGE_BOTTOM);
1188
+            s->dsp.draw_edges(s->current_picture.data[1], s->uvlinesize,
1189
+                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
1190
+                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
1191
+            s->dsp.draw_edges(s->current_picture.data[2], s->uvlinesize,
1192
+                              s->h_edge_pos>>hshift, s->v_edge_pos>>vshift,
1193
+                              EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, EDGE_TOP | EDGE_BOTTOM);
1189 1194
     }
1190 1195
 
1191 1196
     emms_c();
... ...
@@ -2289,14 +2294,19 @@ void ff_draw_horiz_band(MpegEncContext *s, int y, int h){
2289 2289
        && !s->intra_only
2290 2290
        && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
2291 2291
         int sides = 0, edge_h;
2292
+        int hshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_w;
2293
+        int vshift = av_pix_fmt_descriptors[s->avctx->pix_fmt].log2_chroma_h;
2292 2294
         if (y==0) sides |= EDGE_TOP;
2293 2295
         if (y + h >= s->v_edge_pos) sides |= EDGE_BOTTOM;
2294 2296
 
2295 2297
         edge_h= FFMIN(h, s->v_edge_pos - y);
2296 2298
 
2297
-        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y    *s->linesize  , s->linesize  , s->h_edge_pos   , edge_h   , EDGE_WIDTH  , sides);
2298
-        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
2299
-        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>1)*s->uvlinesize, s->uvlinesize, s->h_edge_pos>>1, edge_h>>1, EDGE_WIDTH/2, sides);
2299
+        s->dsp.draw_edges(s->current_picture_ptr->data[0] +  y         *s->linesize  , s->linesize,
2300
+                          s->h_edge_pos        , edge_h        , EDGE_WIDTH        , EDGE_WIDTH        , sides);
2301
+        s->dsp.draw_edges(s->current_picture_ptr->data[1] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
2302
+                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
2303
+        s->dsp.draw_edges(s->current_picture_ptr->data[2] + (y>>vshift)*s->uvlinesize, s->uvlinesize,
2304
+                          s->h_edge_pos>>hshift, edge_h>>hshift, EDGE_WIDTH>>hshift, EDGE_WIDTH>>vshift, sides);
2300 2305
     }
2301 2306
 
2302 2307
     h= FFMIN(h, s->avctx->height - y);
... ...
@@ -527,7 +527,7 @@ static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int strid
527 527
     h264_idct_dc_add_internal(dst, block, stride, 8);
528 528
 }
529 529
 
530
-static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
530
+static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
531 531
     int i;
532 532
     for(i=0; i<16; i++){
533 533
         int nnz = nnzc[ scan8[i] ];
... ...
@@ -538,7 +538,7 @@ static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DC
538 538
     }
539 539
 }
540 540
 
541
-static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
541
+static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
542 542
     int i;
543 543
     for(i=0; i<16; i++){
544 544
         if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
... ...
@@ -546,7 +546,7 @@ static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offse
546 546
     }
547 547
 }
548 548
 
549
-static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
549
+static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
550 550
     int i;
551 551
     for(i=0; i<16; i+=4){
552 552
         int nnz = nnzc[ scan8[i] ];
... ...
@@ -557,13 +557,15 @@ static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DC
557 557
     }
558 558
 }
559 559
 
560
-static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
561
-    int i;
562
-    for(i=16; i<16+8; i++){
563
-        if(nnzc[ scan8[i] ])
564
-            ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
565
-        else if(block[i*16])
566
-            h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
560
+static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
561
+    int i, j;
562
+    for (j = 1; j < 3; j++) {
563
+        for(i = j * 16; i < j * 16 + 4; i++){
564
+            if(nnzc[ scan8[i] ])
565
+                ff_h264_idct_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride);
566
+            else if(block[i*16])
567
+                h264_idct_dc_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride);
568
+        }
567 569
     }
568 570
 }
569 571
 
... ...
@@ -1976,9 +1976,15 @@ static int frame_start(SnowContext *s){
1976 1976
    int h= s->avctx->height;
1977 1977
 
1978 1978
     if(s->current_picture.data[0]){
1979
-        s->dsp.draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  , EDGE_TOP|EDGE_BOTTOM);
1980
-        s->dsp.draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2, EDGE_TOP|EDGE_BOTTOM);
1981
-        s->dsp.draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2, EDGE_TOP|EDGE_BOTTOM);
1979
+        s->dsp.draw_edges(s->current_picture.data[0],
1980
+                          s->current_picture.linesize[0], w   , h   ,
1981
+                          EDGE_WIDTH  , EDGE_WIDTH  , EDGE_TOP | EDGE_BOTTOM);
1982
+        s->dsp.draw_edges(s->current_picture.data[1],
1983
+                          s->current_picture.linesize[1], w>>1, h>>1,
1984
+                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
1985
+        s->dsp.draw_edges(s->current_picture.data[2],
1986
+                          s->current_picture.linesize[2], w>>1, h>>1,
1987
+                          EDGE_WIDTH/2, EDGE_WIDTH/2, EDGE_TOP | EDGE_BOTTOM);
1982 1988
     }
1983 1989
 
1984 1990
     release_buffer(s->avctx);
... ...
@@ -635,8 +635,9 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
635 635
         memset(h->intra4x4_pred_mode+h->mb2br_xy[mb_xy], DC_PRED, 8);
636 636
     }
637 637
     if (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B) {
638
-        memset(h->non_zero_count_cache + 8, 0, 4*9*sizeof(uint8_t));
639
-        s->dsp.clear_blocks(h->mb);
638
+        memset(h->non_zero_count_cache + 8, 0, 14*8*sizeof(uint8_t));
639
+        s->dsp.clear_blocks(h->mb+  0);
640
+        s->dsp.clear_blocks(h->mb+384);
640 641
     }
641 642
 
642 643
     if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == AV_PICTURE_TYPE_B)) {
... ...
@@ -656,8 +657,8 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
656 656
         }
657 657
     }
658 658
     if (IS_INTRA16x16(mb_type)) {
659
-        AV_ZERO128(h->mb_luma_dc+0);
660
-        AV_ZERO128(h->mb_luma_dc+8);
659
+        AV_ZERO128(h->mb_luma_dc[0]+0);
660
+        AV_ZERO128(h->mb_luma_dc[0]+8);
661 661
         if (svq3_decode_block(&s->gb, h->mb_luma_dc, 0, 1)){
662 662
             av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
663 663
             return -1;
... ...
@@ -683,20 +684,23 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
683 683
         }
684 684
 
685 685
         if ((cbp & 0x30)) {
686
-            for (i = 0; i < 2; ++i) {
687
-              if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
686
+            for (i = 1; i < 3; ++i) {
687
+              if (svq3_decode_block(&s->gb, &h->mb[16*16*i], 0, 3)){
688 688
                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
689 689
                 return -1;
690 690
               }
691 691
             }
692 692
 
693 693
             if ((cbp & 0x20)) {
694
-                for (i = 0; i < 8; i++) {
695
-                    h->non_zero_count_cache[ scan8[16+i] ] = 1;
696
-
697
-                    if (svq3_decode_block(&s->gb, &h->mb[16*(16 + i)], 1, 1)){
698
-                        av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
699
-                        return -1;
694
+                for (i = 1; i < 3; i++) {
695
+                    for (j = 0; j < 4; j++) {
696
+                        k = 16*i + j;
697
+                        h->non_zero_count_cache[ scan8[k] ] = 1;
698
+
699
+                        if (svq3_decode_block(&s->gb, &h->mb[16*k], 1, 1)){
700
+                            av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
701
+                            return -1;
702
+                        }
700 703
                     }
701 704
                 }
702 705
             }
... ...
@@ -762,7 +762,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
762 762
 
763 763
 /* draw the edges of width 'w' of an image of size width, height
764 764
    this mmx version can only handle w==8 || w==16 */
765
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
765
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
766 766
 {
767 767
     uint8_t *ptr, *last_line;
768 768
     int i;
... ...
@@ -817,7 +817,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w,
817 817
 
818 818
     /* top and bottom (and hopefully also the corners) */
819 819
     if (sides&EDGE_TOP) {
820
-        for(i = 0; i < w; i += 4) {
820
+        for(i = 0; i < h; i += 4) {
821 821
             ptr= buf - (i + 1) * wrap - w;
822 822
             __asm__ volatile(
823 823
                     "1:                             \n\t"
... ...
@@ -36,7 +36,7 @@
36 36
 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
37 37
 static int decode_significance_x86(CABACContext *c, int max_coeff,
38 38
                                    uint8_t *significant_coeff_ctx_base,
39
-                                   int *index){
39
+                                   int *index, x86_reg last_off){
40 40
     void *end= significant_coeff_ctx_base + max_coeff - 1;
41 41
     int minusstart= -(int)significant_coeff_ctx_base;
42 42
     int minusindex= 4-(int)index;
... ...
@@ -52,10 +52,12 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
52 52
 
53 53
         "test $1, %%edx                         \n\t"
54 54
         " jz 3f                                 \n\t"
55
+        "add  %7, %1                            \n\t"
55 56
 
56
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
57
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
57 58
                              "%%bx", "%%esi", "%%eax", "%%al")
58 59
 
60
+        "sub  %7, %1                            \n\t"
59 61
         "mov  %2, %%"REG_a"                     \n\t"
60 62
         "movl %4, %%ecx                         \n\t"
61 63
         "add  %1, %%"REG_c"                     \n\t"
... ...
@@ -82,7 +84,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
82 82
         "movl %%esi, "RANGE    "(%3)            \n\t"
83 83
         "movl %%ebx, "LOW      "(%3)            \n\t"
84 84
         :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
85
-        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
85
+        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off)
86 86
         : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
87 87
     );
88 88
     return coeff_count;
... ...
@@ -90,7 +92,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
90 90
 
91 91
 static int decode_significance_8x8_x86(CABACContext *c,
92 92
                                        uint8_t *significant_coeff_ctx_base,
93
-                                       int *index, const uint8_t *sig_off){
93
+                                       int *index, x86_reg last_off, const uint8_t *sig_off){
94 94
     int minusindex= 4-(int)index;
95 95
     int coeff_count;
96 96
     x86_reg last=0;
... ...
@@ -114,8 +116,9 @@ static int decode_significance_8x8_x86(CABACContext *c,
114 114
 
115 115
         "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
116 116
         "add %5, %%"REG_D"                      \n\t"
117
+        "add %7, %%"REG_D"                      \n\t"
117 118
 
118
-        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
119
+        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
119 120
                              "%%bx", "%%esi", "%%eax", "%%al")
120 121
 
121 122
         "mov %2, %%"REG_a"                      \n\t"
... ...
@@ -142,7 +145,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
142 142
         "movl %%esi, "RANGE    "(%3)            \n\t"
143 143
         "movl %%ebx, "LOW      "(%3)            \n\t"
144 144
         :"=&a"(coeff_count),"+m"(last), "+m"(index)
145
-        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
145
+        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_off)
146 146
         : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
147 147
     );
148 148
     return coeff_count;
... ...
@@ -32,14 +32,18 @@
32 32
 SECTION_RODATA
33 33
 
34 34
 ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
35
-scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
36
-           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
37
-           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
38
-           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
39
-           db 1+1*8, 2+1*8
40
-           db 1+2*8, 2+2*8
41
-           db 1+4*8, 2+4*8
42
-           db 1+5*8, 2+5*8
35
+scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
36
+           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
37
+           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
38
+           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
39
+           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
40
+           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
41
+           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
42
+           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
43
+           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
44
+           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
45
+           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
46
+           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
43 47
 %ifdef PIC
44 48
 %define scan8 r11
45 49
 %else
... ...
@@ -617,6 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0
617 617
     mov         r10, r0
618 618
 %endif
619 619
     call         h264_idct_add8_mmx_plane
620
+    mov          r5, 32
621
+    add          r2, 384
620 622
 %ifdef ARCH_X86_64
621 623
     add         r10, gprsize
622 624
 %else
... ...
@@ -678,6 +684,8 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0
678 678
     lea         r11, [scan8_mem]
679 679
 %endif
680 680
     call h264_idct_add8_mmx2_plane
681
+    mov          r5, 32
682
+    add          r2, 384
681 683
 %ifdef ARCH_X86_64
682 684
     add         r10, gprsize
683 685
 %else
... ...
@@ -810,12 +818,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
810 810
     test        r0, r0
811 811
     jz .try%1dc
812 812
 %ifdef ARCH_X86_64
813
-    mov        r0d, dword [r1+%1*8+64]
813
+    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
814 814
     add         r0, [r10]
815 815
 %else
816 816
     mov         r0, r0m
817 817
     mov         r0, [r0]
818
-    add         r0, dword [r1+%1*8+64]
818
+    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
819 819
 %endif
820 820
     call        x264_add8x4_idct_sse2
821 821
     jmp .cycle%1end
... ...
@@ -824,16 +832,18 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
824 824
     or         r0w, word [r2+32]
825 825
     jz .cycle%1end
826 826
 %ifdef ARCH_X86_64
827
-    mov        r0d, dword [r1+%1*8+64]
827
+    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
828 828
     add         r0, [r10]
829 829
 %else
830 830
     mov         r0, r0m
831 831
     mov         r0, [r0]
832
-    add         r0, dword [r1+%1*8+64]
832
+    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
833 833
 %endif
834 834
     call        h264_idct_dc_add8_mmx2
835 835
 .cycle%1end
836
-%if %1 < 3
836
+%if %1 == 1
837
+    add         r2, 384+64
838
+%elif %1 < 3
837 839
     add         r2, 64
838 840
 %endif
839 841
 %endmacro
... ...
@@ -845,15 +855,15 @@ cglobal h264_idct_add8_8_sse2, 5, 7, 8
845 845
 %ifdef ARCH_X86_64
846 846
     mov         r10, r0
847 847
 %endif
848
-    add8_sse2_cycle 0, 0x09
849
-    add8_sse2_cycle 1, 0x11
848
+    add8_sse2_cycle 0, 0x34
849
+    add8_sse2_cycle 1, 0x3c
850 850
 %ifdef ARCH_X86_64
851 851
     add         r10, gprsize
852 852
 %else
853 853
     add        r0mp, gprsize
854 854
 %endif
855
-    add8_sse2_cycle 2, 0x21
856
-    add8_sse2_cycle 3, 0x29
855
+    add8_sse2_cycle 2, 0x5c
856
+    add8_sse2_cycle 3, 0x64
857 857
     RET
858 858
 
859 859
 ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
... ...
@@ -29,14 +29,18 @@ SECTION_RODATA
29 29
 
30 30
 pw_pixel_max: times 8 dw ((1 << 10)-1)
31 31
 pd_32:        times 4 dd 32
32
-scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
33
-           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
34
-           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
35
-           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
36
-           db 1+1*8, 2+1*8
37
-           db 1+2*8, 2+2*8
38
-           db 1+4*8, 2+4*8
39
-           db 1+5*8, 2+5*8
32
+scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
33
+           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
34
+           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
35
+           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
36
+           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
37
+           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
38
+           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
39
+           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
40
+           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
41
+           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
42
+           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
43
+           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
40 44
 
41 45
 %ifdef PIC
42 46
 %define scan8 r11
... ...
@@ -306,7 +310,7 @@ INIT_AVX
306 306
 IDCT_ADD16INTRA_10 avx
307 307
 %endif
308 308
 
309
-%assign last_block 24
309
+%assign last_block 36
310 310
 ;-----------------------------------------------------------------------------
311 311
 ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
312 312
 ;-----------------------------------------------------------------------------
... ...
@@ -317,21 +321,22 @@ cglobal h264_idct_add8_10_%1,5,7
317 317
 %endif
318 318
     add      r2, 1024
319 319
     mov      r0, [r0]
320
-    ADD16_OP_INTRA %1, 16, 1+1*8
321
-    ADD16_OP_INTRA %1, 18, 1+2*8
320
+    ADD16_OP_INTRA %1, 16, 4+ 6*8
321
+    ADD16_OP_INTRA %1, 18, 4+ 7*8
322
+    add      r2, 1024-128*2
322 323
 %ifdef ARCH_X86_64
323 324
     mov      r0, [r10+gprsize]
324 325
 %else
325 326
     mov      r0, r0m
326 327
     mov      r0, [r0+gprsize]
327 328
 %endif
328
-    ADD16_OP_INTRA %1, 20, 1+4*8
329
-    ADD16_OP_INTRA %1, 22, 1+5*8
329
+    ADD16_OP_INTRA %1, 32, 4+11*8
330
+    ADD16_OP_INTRA %1, 34, 4+12*8
330 331
     REP_RET
331 332
     AC %1, 16
332 333
     AC %1, 18
333
-    AC %1, 20
334
-    AC %1, 22
334
+    AC %1, 32
335
+    AC %1, 34
335 336
 
336 337
 %endmacro ; IDCT_ADD8
337 338
 
... ...
@@ -506,6 +506,13 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
506 506
         }
507 507
 }
508 508
 
509
+#define output_pixel(pos, val) \
510
+        if (target == PIX_FMT_GRAY16BE) { \
511
+            AV_WB16(pos, val); \
512
+        } else { \
513
+            AV_WL16(pos, val); \
514
+        }
515
+
509 516
 static av_always_inline void
510 517
 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
511 518
                         const int16_t **lumSrc, int lumFilterSize,
... ...
@@ -516,12 +523,6 @@ yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
516 516
 {
517 517
     int i;
518 518
 
519
-#define output_pixel(pos, val) \
520
-        if (target == PIX_FMT_GRAY16BE) { \
521
-            AV_WB16(pos, val); \
522
-        } else { \
523
-            AV_WL16(pos, val); \
524
-        }
525 519
     for (i = 0; i < (dstW >> 1); i++) {
526 520
         int j;
527 521
         int Y1 = 1 << 18;
... ...
@@ -583,10 +584,11 @@ yuv2gray16_1_c_template(SwsContext *c, const uint16_t *buf0,
583 583
         output_pixel(&dest[2 * i2 + 0], Y1);
584 584
         output_pixel(&dest[2 * i2 + 2], Y2);
585 585
     }
586
-#undef output_pixel
587 586
 }
588 587
 
589
-#define YUV2PACKEDWRAPPER(name, ext, fmt) \
588
+#undef output_pixel
589
+
590
+#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
590 591
 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
591 592
                         const int16_t **lumSrc, int lumFilterSize, \
592 593
                         const int16_t *chrFilter, const int16_t **chrUSrc, \
... ...
@@ -594,7 +596,7 @@ static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
594 594
                         const int16_t **alpSrc, uint8_t *dest, int dstW, \
595 595
                         int y) \
596 596
 { \
597
-    name ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
597
+    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
598 598
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
599 599
                           alpSrc, dest, dstW, y, fmt); \
600 600
 } \
... ...
@@ -606,7 +608,7 @@ static void name ## ext ## _2_c(SwsContext *c, const uint16_t *buf0, \
606 606
                         const uint16_t *abuf1, uint8_t *dest, int dstW, \
607 607
                         int yalpha, int uvalpha, int y) \
608 608
 { \
609
-    name ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
609
+    name ## base ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
610 610
                           vbuf0, vbuf1, abuf0, abuf1, \
611 611
                           dest, dstW, yalpha, uvalpha, y, fmt); \
612 612
 } \
... ...
@@ -618,13 +620,20 @@ static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
618 618
                         int uvalpha, enum PixelFormat dstFormat, \
619 619
                         int flags, int y) \
620 620
 { \
621
-    name ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
621
+    name ## base ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
622 622
                           vbuf1, abuf0, dest, dstW, uvalpha, \
623 623
                           dstFormat, flags, y, fmt); \
624 624
 }
625 625
 
626
-YUV2PACKEDWRAPPER(yuv2gray16, LE, PIX_FMT_GRAY16LE);
627
-YUV2PACKEDWRAPPER(yuv2gray16, BE, PIX_FMT_GRAY16BE);
626
+YUV2PACKEDWRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
627
+YUV2PACKEDWRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
628
+
629
+#define output_pixel(pos, acc) \
630
+    if (target == PIX_FMT_MONOBLACK) { \
631
+        pos = acc; \
632
+    } else { \
633
+        pos = ~acc; \
634
+    }
628 635
 
629 636
 static av_always_inline void
630 637
 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
... ...
@@ -639,12 +648,6 @@ yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
639 639
     int i;
640 640
     int acc = 0;
641 641
 
642
-#define output_pixel(pos, acc) \
643
-    if (target == PIX_FMT_MONOBLACK) { \
644
-        pos = acc; \
645
-    } else { \
646
-        pos = ~acc; \
647
-    }
648 642
     for (i = 0; i < dstW - 1; i += 2) {
649 643
         int j;
650 644
         int Y1 = 1 << 18;
... ...
@@ -718,21 +721,12 @@ yuv2mono_1_c_template(SwsContext *c, const uint16_t *buf0,
718 718
         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
719 719
         output_pixel(*dest++, acc);
720 720
     }
721
-#undef output_pixel
722 721
 }
723 722
 
724
-YUV2PACKEDWRAPPER(yuv2mono, white, PIX_FMT_MONOWHITE);
725
-YUV2PACKEDWRAPPER(yuv2mono, black, PIX_FMT_MONOBLACK);
723
+#undef output_pixel
726 724
 
727
-static av_always_inline void
728
-yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
729
-                     const int16_t **lumSrc, int lumFilterSize,
730
-                     const int16_t *chrFilter, const int16_t **chrUSrc,
731
-                     const int16_t **chrVSrc, int chrFilterSize,
732
-                     const int16_t **alpSrc, uint8_t *dest, int dstW,
733
-                     int y, enum PixelFormat target)
734
-{
735
-    int i;
725
+YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
726
+YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
736 727
 
737 728
 #define output_pixels(pos, Y1, U, Y2, V) \
738 729
     if (target == PIX_FMT_YUYV422) { \
... ...
@@ -747,6 +741,16 @@ yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
747 747
         dest[pos + 3] = Y2; \
748 748
     }
749 749
 
750
+static av_always_inline void
751
+yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
752
+                     const int16_t **lumSrc, int lumFilterSize,
753
+                     const int16_t *chrFilter, const int16_t **chrUSrc,
754
+                     const int16_t **chrVSrc, int chrFilterSize,
755
+                     const int16_t **alpSrc, uint8_t *dest, int dstW,
756
+                     int y, enum PixelFormat target)
757
+{
758
+    int i;
759
+
750 760
     for (i = 0; i < (dstW >> 1); i++) {
751 761
         int j;
752 762
         int Y1 = 1 << 18;
... ...
@@ -828,11 +832,156 @@ yuv2422_1_c_template(SwsContext *c, const uint16_t *buf0,
828 828
             output_pixels(i * 4, Y1, U, Y2, V);
829 829
         }
830 830
     }
831
+}
832
+
831 833
 #undef output_pixels
834
+
835
+YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
836
+YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
837
+
838
+#define r_b ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? r : b)
839
+#define b_r ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? b : r)
840
+
841
+static av_always_inline void
842
+yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
843
+                       const int16_t **lumSrc, int lumFilterSize,
844
+                       const int16_t *chrFilter, const int16_t **chrUSrc,
845
+                       const int16_t **chrVSrc, int chrFilterSize,
846
+                       const int16_t **alpSrc, uint8_t *dest, int dstW,
847
+                       int y, enum PixelFormat target)
848
+{
849
+    int i;
850
+
851
+    for (i = 0; i < (dstW >> 1); i++) {
852
+        int j;
853
+        int Y1 = 1 << 18;
854
+        int Y2 = 1 << 18;
855
+        int U  = 1 << 18;
856
+        int V  = 1 << 18;
857
+        const uint8_t *r, *g, *b;
858
+
859
+        for (j = 0; j < lumFilterSize; j++) {
860
+            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
861
+            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
862
+        }
863
+        for (j = 0; j < chrFilterSize; j++) {
864
+            U += chrUSrc[j][i] * chrFilter[j];
865
+            V += chrVSrc[j][i] * chrFilter[j];
866
+        }
867
+        Y1 >>= 19;
868
+        Y2 >>= 19;
869
+        U  >>= 19;
870
+        V  >>= 19;
871
+        if ((Y1 | Y2 | U | V) & 0x100) {
872
+            Y1 = av_clip_uint8(Y1);
873
+            Y2 = av_clip_uint8(Y2);
874
+            U  = av_clip_uint8(U);
875
+            V  = av_clip_uint8(V);
876
+        }
877
+
878
+        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
879
+        r = (const uint8_t *) c->table_rV[V];
880
+        g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]);
881
+        b = (const uint8_t *) c->table_bU[U];
882
+
883
+        dest[ 0] = dest[ 1] = r_b[Y1];
884
+        dest[ 2] = dest[ 3] =   g[Y1];
885
+        dest[ 4] = dest[ 5] = b_r[Y1];
886
+        dest[ 6] = dest[ 7] = r_b[Y2];
887
+        dest[ 8] = dest[ 9] =   g[Y2];
888
+        dest[10] = dest[11] = b_r[Y2];
889
+        dest += 12;
890
+    }
891
+}
892
+
893
+static av_always_inline void
894
+yuv2rgb48_2_c_template(SwsContext *c, const uint16_t *buf0,
895
+                       const uint16_t *buf1, const uint16_t *ubuf0,
896
+                       const uint16_t *ubuf1, const uint16_t *vbuf0,
897
+                       const uint16_t *vbuf1, const uint16_t *abuf0,
898
+                       const uint16_t *abuf1, uint8_t *dest, int dstW,
899
+                       int yalpha, int uvalpha, int y,
900
+                       enum PixelFormat target)
901
+{
902
+    int  yalpha1 = 4095 - yalpha;
903
+    int uvalpha1 = 4095 - uvalpha;
904
+    int i;
905
+
906
+    for (i = 0; i < (dstW >> 1); i++) {
907
+        int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
908
+        int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
909
+        int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
910
+        int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
911
+        const uint8_t *r = (const uint8_t *) c->table_rV[V],
912
+                      *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
913
+                      *b = (const uint8_t *) c->table_bU[U];
914
+
915
+        dest[ 0] = dest[ 1] = r_b[Y1];
916
+        dest[ 2] = dest[ 3] =   g[Y1];
917
+        dest[ 4] = dest[ 5] = b_r[Y1];
918
+        dest[ 6] = dest[ 7] = r_b[Y2];
919
+        dest[ 8] = dest[ 9] =   g[Y2];
920
+        dest[10] = dest[11] = b_r[Y2];
921
+        dest += 12;
922
+    }
923
+}
924
+
925
+static av_always_inline void
926
+yuv2rgb48_1_c_template(SwsContext *c, const uint16_t *buf0,
927
+                       const uint16_t *ubuf0, const uint16_t *ubuf1,
928
+                       const uint16_t *vbuf0, const uint16_t *vbuf1,
929
+                       const uint16_t *abuf0, uint8_t *dest, int dstW,
930
+                       int uvalpha, enum PixelFormat dstFormat,
931
+                       int flags, int y, enum PixelFormat target)
932
+{
933
+    int i;
934
+
935
+    if (uvalpha < 2048) {
936
+        for (i = 0; i < (dstW >> 1); i++) {
937
+            int Y1 = buf0[i * 2]     >> 7;
938
+            int Y2 = buf0[i * 2 + 1] >> 7;
939
+            int U  = ubuf1[i]        >> 7;
940
+            int V  = vbuf1[i]        >> 7;
941
+            const uint8_t *r = (const uint8_t *) c->table_rV[V],
942
+                          *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
943
+                          *b = (const uint8_t *) c->table_bU[U];
944
+
945
+            dest[ 0] = dest[ 1] = r_b[Y1];
946
+            dest[ 2] = dest[ 3] =   g[Y1];
947
+            dest[ 4] = dest[ 5] = b_r[Y1];
948
+            dest[ 6] = dest[ 7] = r_b[Y2];
949
+            dest[ 8] = dest[ 9] =   g[Y2];
950
+            dest[10] = dest[11] = b_r[Y2];
951
+            dest += 12;
952
+        }
953
+    } else {
954
+        for (i = 0; i < (dstW >> 1); i++) {
955
+            int Y1 =  buf0[i * 2]          >> 7;
956
+            int Y2 =  buf0[i * 2 + 1]      >> 7;
957
+            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
958
+            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
959
+            const uint8_t *r = (const uint8_t *) c->table_rV[V],
960
+                          *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
961
+                          *b = (const uint8_t *) c->table_bU[U];
962
+
963
+            dest[ 0] = dest[ 1] = r_b[Y1];
964
+            dest[ 2] = dest[ 3] =   g[Y1];
965
+            dest[ 4] = dest[ 5] = b_r[Y1];
966
+            dest[ 6] = dest[ 7] = r_b[Y2];
967
+            dest[ 8] = dest[ 9] =   g[Y2];
968
+            dest[10] = dest[11] = b_r[Y2];
969
+            dest += 12;
970
+        }
971
+    }
832 972
 }
833 973
 
834
-YUV2PACKEDWRAPPER(yuv2422, yuyv, PIX_FMT_YUYV422);
835
-YUV2PACKEDWRAPPER(yuv2422, uyvy, PIX_FMT_UYVY422);
974
+#undef r_b
975
+#undef b_r
976
+
977
+YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
978
+//YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
979
+YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
980
+//YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
836 981
 
837 982
 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
838 983
     for (i=0; i<(dstW>>1); i++) {\
... ...
@@ -974,36 +1123,6 @@ YUV2PACKEDWRAPPER(yuv2422, uyvy, PIX_FMT_UYVY422);
974 974
 
975 975
 #define YSCALE_YUV_2_ANYRGB_C(func)\
976 976
     switch(c->dstFormat) {\
977
-    case PIX_FMT_RGB48BE:\
978
-    case PIX_FMT_RGB48LE:\
979
-        func(uint8_t,0)\
980
-            ((uint8_t*)dest)[ 0]= r[Y1];\
981
-            ((uint8_t*)dest)[ 1]= r[Y1];\
982
-            ((uint8_t*)dest)[ 2]= g[Y1];\
983
-            ((uint8_t*)dest)[ 3]= g[Y1];\
984
-            ((uint8_t*)dest)[ 4]= b[Y1];\
985
-            ((uint8_t*)dest)[ 5]= b[Y1];\
986
-            ((uint8_t*)dest)[ 6]= r[Y2];\
987
-            ((uint8_t*)dest)[ 7]= r[Y2];\
988
-            ((uint8_t*)dest)[ 8]= g[Y2];\
989
-            ((uint8_t*)dest)[ 9]= g[Y2];\
990
-            ((uint8_t*)dest)[10]= b[Y2];\
991
-            ((uint8_t*)dest)[11]= b[Y2];\
992
-            dest+=12;\
993
-        }\
994
-        break;\
995
-    case PIX_FMT_BGR48BE:\
996
-    case PIX_FMT_BGR48LE:\
997
-        func(uint8_t,0)\
998
-            ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
999
-            ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
1000
-            ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
1001
-            ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
1002
-            ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
1003
-            ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
1004
-            dest+=12;\
1005
-        }\
1006
-        break;\
1007 977
     case PIX_FMT_RGBA:\
1008 978
     case PIX_FMT_BGRA:\
1009 979
         if (CONFIG_SMALL) {\
... ...
@@ -1294,19 +1413,21 @@ static av_always_inline void fillPlane(uint8_t* plane, int stride,
1294 1294
     }
1295 1295
 }
1296 1296
 
1297
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1298
+
1299
+#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1300
+#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1301
+
1297 1302
 static av_always_inline void
1298 1303
 rgb48ToY_c_template(int16_t *dst, const uint16_t *src, int width,
1299 1304
                     enum PixelFormat origin)
1300 1305
 {
1301 1306
     int i;
1302 1307
     for (i = 0; i < width; i++) {
1303
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1304
-        int a = input_pixel(&src[i*3+0]);
1305
-        int g = input_pixel(&src[i*3+1]);
1306
-        int c = input_pixel(&src[i*3+2]);
1308
+        int r_b = input_pixel(&src[i*3+0]);
1309
+        int   g = input_pixel(&src[i*3+1]);
1310
+        int b_r = input_pixel(&src[i*3+2]);
1307 1311
 
1308
-#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? c : a)
1309
-#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? a : c)
1310 1312
         dst[i] = (RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
1311 1313
     }
1312 1314
 }
... ...
@@ -1319,9 +1440,9 @@ rgb48ToUV_c_template(int16_t *dstU, int16_t *dstV,
1319 1319
     int i;
1320 1320
     assert(src1==src2);
1321 1321
     for (i = 0; i < width; i++) {
1322
-        int a = input_pixel(&src1[3*i + 0]);
1323
-        int g = input_pixel(&src1[3*i + 1]);
1324
-        int c = input_pixel(&src1[3*i + 2]);
1322
+        int r_b = input_pixel(&src1[i*3+0]);
1323
+        int   g = input_pixel(&src1[i*3+1]);
1324
+        int b_r = input_pixel(&src1[i*3+2]);
1325 1325
 
1326 1326
         dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
1327 1327
         dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
... ...
@@ -1336,17 +1457,18 @@ rgb48ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1336 1336
     int i;
1337 1337
     assert(src1==src2);
1338 1338
     for (i = 0; i < width; i++) {
1339
-        int a = (input_pixel(&src1[6*i + 0])) + (input_pixel(&src1[6*i + 3]));
1340
-        int g = (input_pixel(&src1[6*i + 1])) + (input_pixel(&src1[6*i + 4]));
1341
-        int c = (input_pixel(&src1[6*i + 2])) + (input_pixel(&src1[6*i + 5]));
1339
+        int r_b = (input_pixel(&src1[6*i + 0])) + (input_pixel(&src1[6*i + 3]));
1340
+        int   g = (input_pixel(&src1[6*i + 1])) + (input_pixel(&src1[6*i + 4]));
1341
+        int b_r = (input_pixel(&src1[6*i + 2])) + (input_pixel(&src1[6*i + 5]));
1342 1342
 
1343 1343
         dstU[i]= (RU*r + GU*g + BU*b + (256U<<(RGB2YUV_SHIFT+8)) + (1<<(RGB2YUV_SHIFT-6+8))) >> (RGB2YUV_SHIFT-5+8);
1344 1344
         dstV[i]= (RV*r + GV*g + BV*b + (256U<<(RGB2YUV_SHIFT+8)) + (1<<(RGB2YUV_SHIFT-6+8))) >> (RGB2YUV_SHIFT-5+8);
1345 1345
     }
1346
+}
1347
+
1346 1348
 #undef r
1347 1349
 #undef b
1348 1350
 #undef input_pixel
1349
-}
1350 1351
 
1351 1352
 #define rgb48funcs(pattern, BE_LE, origin) \
1352 1353
 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
... ...
@@ -1374,6 +1496,10 @@ rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1374 1374
 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1375 1375
 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1376 1376
 
1377
+#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1378
+                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1379
+                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1380
+
1377 1381
 static av_always_inline void
1378 1382
 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1379 1383
                        int width, enum PixelFormat origin,
... ...
@@ -1386,9 +1512,6 @@ rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1386 1386
     int i;
1387 1387
 
1388 1388
     for (i = 0; i < width; i++) {
1389
-#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1390
-                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1391
-                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1392 1389
         int px = input_pixel(i) >> shp;
1393 1390
         int b = (px & maskb) >> shb;
1394 1391
         int g = (px & maskg) >> shg;
... ...
@@ -1454,9 +1577,10 @@ rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1454 1454
         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1455 1455
         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1456 1456
     }
1457
-#undef input_pixel
1458 1457
 }
1459 1458
 
1459
+#undef input_pixel
1460
+
1460 1461
 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1461 1462
                          maskg, maskb, rsh, gsh, bsh, S) \
1462 1463
 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
... ...
@@ -1637,6 +1761,22 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1637 1637
     }
1638 1638
 }
1639 1639
 
1640
+static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1641
+                       const uint8_t *src1, const uint8_t *src2,
1642
+                       int width, uint32_t *unused)
1643
+{
1644
+    nvXXtoUV_c(dstU, dstV, src1, width);
1645
+}
1646
+
1647
+static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1648
+                       const uint8_t *src1, const uint8_t *src2,
1649
+                       int width, uint32_t *unused)
1650
+{
1651
+    nvXXtoUV_c(dstV, dstU, src1, width);
1652
+}
1653
+
1654
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1655
+
1640 1656
 // FIXME Maybe dither instead.
1641 1657
 static av_always_inline void
1642 1658
 yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
... ...
@@ -1647,7 +1787,6 @@ yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1647 1647
     const uint16_t *srcU = (const uint16_t *) _srcU;
1648 1648
     const uint16_t *srcV = (const uint16_t *) _srcV;
1649 1649
 
1650
-#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1651 1650
     for (i = 0; i < width; i++) {
1652 1651
         dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
1653 1652
         dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
... ...
@@ -1663,9 +1802,10 @@ yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
1663 1663
 
1664 1664
     for (i = 0; i < width; i++)
1665 1665
         dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
1666
-#undef input_pixel
1667 1666
 }
1668 1667
 
1668
+#undef input_pixel
1669
+
1669 1670
 #define YUV_NBPS(depth, BE_LE, origin) \
1670 1671
 static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1671 1672
                                      const uint8_t *srcU, const uint8_t *srcV, \
... ...
@@ -1684,20 +1824,6 @@ YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
1684 1684
 YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
1685 1685
 YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1686 1686
 
1687
-static inline void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1688
-                              const uint8_t *src1, const uint8_t *src2,
1689
-                              int width, uint32_t *unused)
1690
-{
1691
-    nvXXtoUV_c(dstU, dstV, src1, width);
1692
-}
1693
-
1694
-static inline void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1695
-                              const uint8_t *src1, const uint8_t *src2,
1696
-                              int width, uint32_t *unused)
1697
-{
1698
-    nvXXtoUV_c(dstV, dstU, src1, width);
1699
-}
1700
-
1701 1687
 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1702 1688
                        int width, uint32_t *unused)
1703 1689
 {
... ...
@@ -2003,14 +2129,34 @@ find_c_packed_planar_out_funcs(SwsContext *c,
2003 2003
             *yuv2packedX = yuv2monoblack_X_c;
2004 2004
             break;
2005 2005
         case PIX_FMT_YUYV422:
2006
-            *yuv2packed1 = yuv2422yuyv_1_c;
2007
-            *yuv2packed2 = yuv2422yuyv_2_c;
2008
-            *yuv2packedX = yuv2422yuyv_X_c;
2006
+            *yuv2packed1 = yuv2yuyv422_1_c;
2007
+            *yuv2packed2 = yuv2yuyv422_2_c;
2008
+            *yuv2packedX = yuv2yuyv422_X_c;
2009 2009
             break;
2010 2010
         case PIX_FMT_UYVY422:
2011
-            *yuv2packed1 = yuv2422uyvy_1_c;
2012
-            *yuv2packed2 = yuv2422uyvy_2_c;
2013
-            *yuv2packedX = yuv2422uyvy_X_c;
2011
+            *yuv2packed1 = yuv2uyvy422_1_c;
2012
+            *yuv2packed2 = yuv2uyvy422_2_c;
2013
+            *yuv2packedX = yuv2uyvy422_X_c;
2014
+            break;
2015
+        case PIX_FMT_RGB48LE:
2016
+            //*yuv2packed1 = yuv2rgb48le_1_c;
2017
+            //*yuv2packed2 = yuv2rgb48le_2_c;
2018
+            //*yuv2packedX = yuv2rgb48le_X_c;
2019
+            //break;
2020
+        case PIX_FMT_RGB48BE:
2021
+            *yuv2packed1 = yuv2rgb48be_1_c;
2022
+            *yuv2packed2 = yuv2rgb48be_2_c;
2023
+            *yuv2packedX = yuv2rgb48be_X_c;
2024
+            break;
2025
+        case PIX_FMT_BGR48LE:
2026
+            //*yuv2packed1 = yuv2bgr48le_1_c;
2027
+            //*yuv2packed2 = yuv2bgr48le_2_c;
2028
+            //*yuv2packedX = yuv2bgr48le_X_c;
2029
+            //break;
2030
+        case PIX_FMT_BGR48BE:
2031
+            *yuv2packed1 = yuv2bgr48be_1_c;
2032
+            *yuv2packed2 = yuv2bgr48be_2_c;
2033
+            *yuv2packedX = yuv2bgr48be_X_c;
2014 2034
             break;
2015 2035
         default:
2016 2036
             *yuv2packed1 = yuv2packed1_c;