Browse code

lavc: support subtitles character encoding conversion.

Clément Bœsch authored on 2013/01/08 02:08:56
Showing 6 changed files
... ...
@@ -21,6 +21,7 @@ version <next>:
21 21
 - encrypted TTA stream decoding support
22 22
 - RF64 support in WAV muxer
23 23
 - noise filter ported from libmpcodecs
24
+- Subtitles character encoding conversion
24 25
 
25 26
 
26 27
 version 1.1:
... ...
@@ -1390,6 +1390,7 @@ HAVE_LIST="
1390 1390
     gnu_as
1391 1391
     gsm_h
1392 1392
     ibm_asm
1393
+    iconv
1393 1394
     inet_aton
1394 1395
     io_h
1395 1396
     isatty
... ...
@@ -3716,6 +3717,7 @@ check_func  getopt
3716 3716
 check_func  getrusage
3717 3717
 check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
3718 3718
 check_func  gettimeofday
3719
+check_func  iconv
3719 3720
 check_func  inet_aton $network_extralibs
3720 3721
 check_func  isatty
3721 3722
 check_func  localtime_r
... ...
@@ -3208,6 +3208,24 @@ typedef struct AVCodecContext {
3208 3208
      * - encoding: unused
3209 3209
      */
3210 3210
     AVDictionary *metadata;
3211
+
3212
+    /**
3213
+     * Character encoding of the input subtitles file.
3214
+     * - decoding: set by user
3215
+     * - encoding: unused
3216
+     */
3217
+    char *sub_charenc;
3218
+
3219
+    /**
3220
+     * Subtitles character encoding mode. Formats or codecs might be adjusting
3221
+     * this setting (if they are doing the conversion themselves for instance).
3222
+     * - decoding: set by libavcodec
3223
+     * - encoding: unused
3224
+     */
3225
+    int sub_charenc_mode;
3226
+#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
3227
+#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
3228
+#define FF_SUB_CHARENC_MODE_PRE_DECODER  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
3211 3229
 } AVCodecContext;
3212 3230
 
3213 3231
 AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
... ...
@@ -406,6 +406,11 @@ static const AVOption options[]={
406 406
 {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
407 407
 {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
408 408
 {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
409
+{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
410
+{"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
411
+{"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
412
+{"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
413
+{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
409 414
 {NULL},
410 415
 };
411 416
 
... ...
@@ -48,6 +48,9 @@
48 48
 #include <stdarg.h>
49 49
 #include <limits.h>
50 50
 #include <float.h>
51
+#if HAVE_ICONV
52
+# include <iconv.h>
53
+#endif
51 54
 
52 55
 volatile int ff_avcodec_locked;
53 56
 static int volatile entangled_thread_counter = 0;
... ...
@@ -1089,6 +1092,32 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
1089 1089
             ret = AVERROR(EINVAL);
1090 1090
             goto free_and_end;
1091 1091
         }
1092
+        if (avctx->sub_charenc) {
1093
+            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
1094
+                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
1095
+                       "supported with subtitles codecs\n");
1096
+                ret = AVERROR(EINVAL);
1097
+                goto free_and_end;
1098
+            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
1099
+                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
1100
+                       "subtitles character encoding will be ignored\n",
1101
+                       avctx->codec_descriptor->name);
1102
+                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
1103
+            } else {
1104
+                /* input character encoding is set for a text based subtitle
1105
+                 * codec at this point */
1106
+                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
1107
+                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_PRE_DECODER;
1108
+
1109
+                if (!HAVE_ICONV && avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_PRE_DECODER) {
1110
+                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
1111
+                           "conversion needs a libavcodec built with iconv support "
1112
+                           "for this codec\n");
1113
+                    ret = AVERROR(ENOSYS);
1114
+                    goto free_and_end;
1115
+                }
1116
+            }
1117
+        }
1092 1118
     }
1093 1119
 end:
1094 1120
     ff_unlock_avcodec();
... ...
@@ -1847,6 +1876,68 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
1847 1847
     return ret;
1848 1848
 }
1849 1849
 
1850
+#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
1851
+static int recode_subtitle(AVCodecContext *avctx,
1852
+                           AVPacket *outpkt, const AVPacket *inpkt)
1853
+{
1854
+#if HAVE_ICONV
1855
+    iconv_t cd = (iconv_t)-1;
1856
+    int ret = 0;
1857
+    char *inb, *outb;
1858
+    size_t inl, outl;
1859
+    AVPacket tmp;
1860
+#endif
1861
+
1862
+    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_PRE_DECODER)
1863
+        return 0;
1864
+
1865
+#if HAVE_ICONV
1866
+    cd = iconv_open("UTF-8", avctx->sub_charenc);
1867
+    if (cd == (iconv_t)-1) {
1868
+        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
1869
+               "with input character encoding \"%s\"\n", avctx->sub_charenc);
1870
+        ret = AVERROR(errno);
1871
+        goto end;
1872
+    }
1873
+
1874
+    inb = inpkt->data;
1875
+    inl = inpkt->size;
1876
+
1877
+    if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
1878
+        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
1879
+        ret = AVERROR(ENOMEM);
1880
+        goto end;
1881
+    }
1882
+
1883
+    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
1884
+    if (ret < 0)
1885
+        goto end;
1886
+    outpkt->data = tmp.data;
1887
+    outpkt->size = tmp.size;
1888
+    outb = outpkt->data;
1889
+    outl = outpkt->size;
1890
+
1891
+    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
1892
+        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
1893
+        outl >= outpkt->size || inl != 0) {
1894
+        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
1895
+               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
1896
+        av_free_packet(&tmp);
1897
+        ret = AVERROR(errno);
1898
+        goto end;
1899
+    }
1900
+    outpkt->size -= outl;
1901
+    outpkt->data[outpkt->size - 1] = '\0';
1902
+
1903
+end:
1904
+    if (cd != (iconv_t)-1)
1905
+        iconv_close(cd);
1906
+    return ret;
1907
+#else
1908
+    av_assert0(!"requesting subtitles recoding without iconv");
1909
+#endif
1910
+}
1911
+
1850 1912
 int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
1851 1913
                              int *got_sub_ptr,
1852 1914
                              AVPacket *avpkt)
... ...
@@ -1862,19 +1953,28 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
1862 1862
     avcodec_get_subtitle_defaults(sub);
1863 1863
 
1864 1864
     if (avpkt->size) {
1865
+        AVPacket pkt_recoded;
1865 1866
         AVPacket tmp = *avpkt;
1866 1867
         int did_split = av_packet_split_side_data(&tmp);
1867 1868
         //apply_param_change(avctx, &tmp);
1868 1869
 
1869
-        avctx->pkt = &tmp;
1870
+        pkt_recoded = tmp;
1871
+        ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
1872
+        if (ret < 0) {
1873
+            *got_sub_ptr = 0;
1874
+        } else {
1875
+        avctx->pkt = &pkt_recoded;
1870 1876
 
1871 1877
         if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
1872 1878
             sub->pts = av_rescale_q(avpkt->pts,
1873 1879
                                     avctx->pkt_timebase, AV_TIME_BASE_Q);
1874
-        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
1880
+        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
1881
+        if (tmp.data != pkt_recoded.data)
1882
+            av_free(pkt_recoded.data);
1875 1883
         sub->format = !(avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB);
1876
-
1877 1884
         avctx->pkt = NULL;
1885
+        }
1886
+
1878 1887
         if (did_split) {
1879 1888
             ff_packet_free_side_data(&tmp);
1880 1889
             if(ret == tmp.size)
... ...
@@ -29,8 +29,8 @@
29 29
 #include "libavutil/avutil.h"
30 30
 
31 31
 #define LIBAVCODEC_VERSION_MAJOR 54
32
-#define LIBAVCODEC_VERSION_MINOR 91
33
-#define LIBAVCODEC_VERSION_MICRO 103
32
+#define LIBAVCODEC_VERSION_MINOR 92
33
+#define LIBAVCODEC_VERSION_MICRO 100
34 34
 
35 35
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
36 36
                                                LIBAVCODEC_VERSION_MINOR, \