diff --git a/pkgs/flatpak/com.github.rafostar.Clapper.json b/pkgs/flatpak/com.github.rafostar.Clapper.json index efed01fd..668dc18e 100644 --- a/pkgs/flatpak/com.github.rafostar.Clapper.json +++ b/pkgs/flatpak/com.github.rafostar.Clapper.json @@ -30,6 +30,7 @@ "lib/libdvdnav.json", "lib/libass.json", "lib/ffmpeg.json", + "lib/uchardet.json", "gstreamer-1.0/gstreamer.json", "gstreamer-1.0/gst-plugins-base.json", "gstreamer-1.0/gst-plugins-good.json", diff --git a/pkgs/flatpak/gstreamer-1.0/gst-plugins-base-autodetect-subtitle-text-encoding.patch b/pkgs/flatpak/gstreamer-1.0/gst-plugins-base-autodetect-subtitle-text-encoding.patch new file mode 100644 index 00000000..6d3422e2 --- /dev/null +++ b/pkgs/flatpak/gstreamer-1.0/gst-plugins-base-autodetect-subtitle-text-encoding.patch @@ -0,0 +1,142 @@ +From 61a66babede5a587783a1d4eb28e950a755ff362 Mon Sep 17 00:00:00 2001 +From: Rafostar +Date: Wed, 25 Nov 2020 14:44:21 +0100 +Subject: [PATCH] subparse: Autodetect subtitle text encoding + +Use "uchardet" to guess the subtitle text encoding if it is not in UTF-8 +or manually specified instead of blindly guessing its "ISO-8859-15". +The "uchardet" dependency is optional and when is not available at +compile time, then old behaviour will be used. +--- + gst/subparse/gstsubparse.c | 58 +++++++++++++++++++++++++++++++++----- + gst/subparse/meson.build | 12 ++++++-- + 2 files changed, 61 insertions(+), 9 deletions(-) + +diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c +index 382e430f2..42283d2d1 100644 +--- a/gst/subparse/gstsubparse.c ++++ b/gst/subparse/gstsubparse.c +@@ -31,6 +31,10 @@ + #include + #include + ++#if defined(HAVE_UCHARDET) ++#include ++#endif ++ + #include "gstsubparse.h" + #include "gstssaparse.h" + #include "samiparse.h" +@@ -148,8 +152,9 @@ gst_sub_parse_class_init (GstSubParseClass * klass) + "Encoding to assume if input subtitles are not in UTF-8 or any other " + "Unicode encoding. If not set, the GST_SUBTITLE_ENCODING environment " + "variable will be checked for an encoding to use. If that is not set " +- "either, ISO-8859-15 will be assumed.", DEFAULT_ENCODING, +- G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); ++ "either, then if plugin was build with uchardet support it will be " ++ "used to guess the encoding, otherwise ISO-8859-15 will be assumed.", ++ DEFAULT_ENCODING, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); + + g_object_class_install_property (object_class, PROP_VIDEOFPS, + gst_param_spec_fraction ("video-fps", "Video framerate", +@@ -439,6 +444,35 @@ detect_encoding (const gchar * str, gsize len) + return NULL; + } + ++static gchar * ++uchardet_detect_encoding (const gchar * str, gsize len) ++{ ++ gchar *charset = NULL; ++ gint retval; ++ ++#if defined(HAVE_UCHARDET) ++ uchardet_t handle = uchardet_new (); ++ retval = uchardet_handle_data (handle, str, len); ++ ++ GST_DEBUG ("detecting encoding with uchardet using %li characters", len); ++ ++ if (retval != 0) { ++ GST_WARNING ("could not handle data with uchardet"); ++ } else { ++ uchardet_data_end (handle); ++ charset = g_strdup (uchardet_get_charset (handle)); ++ ++ if (charset == NULL || *charset == '\0') ++ GST_WARNING ("uchardet could not detect encoding"); ++ else ++ GST_INFO ("uchardet detected encoding: %s", charset); ++ } ++ uchardet_delete (handle); ++#endif ++ ++ return charset; ++} ++ + static gchar * + convert_encoding (GstSubParse * self, const gchar * str, gsize len, + gsize * consumed) +@@ -481,11 +515,18 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, + encoding = g_getenv ("GST_SUBTITLE_ENCODING"); + } + if (encoding == NULL || *encoding == '\0') { +- /* if local encoding is UTF-8 and no encoding specified +- * via the environment variable, assume ISO-8859-15 */ +- if (g_get_charset (&encoding)) { ++ /* no encoding specified via the environment variable either, ++ * so try to autodetect with uchardet */ ++ encoding = uchardet_detect_encoding (str, len); ++ } ++ ++ /* if uchardet failed and local encoding is UTF-8, assume ISO-8859-15 */ ++ if (encoding == NULL || *encoding == '\0') { ++ if (g_get_charset (&encoding)) + encoding = "ISO-8859-15"; +- } ++ } else { ++ /* reuse the detected encoding from now on */ ++ self->detected_encoding = g_strdup (encoding); + } + + ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err); +@@ -2159,7 +2200,10 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private) + enc = g_getenv ("GST_SUBTITLE_ENCODING"); + if (enc == NULL || *enc == '\0') { + /* if local encoding is UTF-8 and no encoding specified +- * via the environment variable, assume ISO-8859-15 */ ++ * via the environment variable, assume ISO-8859-15 ++ * ++ * Encoding here is only used for type find, so no need ++ * to run through uchardet at this point */ + if (g_get_charset (&enc)) { + enc = "ISO-8859-15"; + } +diff --git a/gst/subparse/meson.build b/gst/subparse/meson.build +index 9a76601f0..2dcf8830f 100644 +--- a/gst/subparse/meson.build ++++ b/gst/subparse/meson.build +@@ -6,12 +6,20 @@ subparse_sources = [ + 'mpl2parse.c', + 'qttextparse.c', + ] ++subparse_defines = [] ++subparse_optional_deps = [] ++ ++subparse_uchardet_dep = dependency('uchardet', required : false) ++if subparse_uchardet_dep.found() ++ subparse_defines += '-DHAVE_UCHARDET' ++ subparse_optional_deps += subparse_uchardet_dep ++endif + + gstsubparse = library('gstsubparse', + subparse_sources, +- c_args : gst_plugins_base_args, ++ c_args : gst_plugins_base_args + subparse_defines, + include_directories: [configinc, libsinc], +- dependencies : [gst_base_dep], ++ dependencies : [gst_base_dep] + subparse_optional_deps, + install : true, + install_dir : plugins_install_dir, + ) +-- +2.26.2 + diff --git a/pkgs/flatpak/gstreamer-1.0/gst-plugins-base.json b/pkgs/flatpak/gstreamer-1.0/gst-plugins-base.json index 5a81bae4..bcf8b6af 100644 --- a/pkgs/flatpak/gstreamer-1.0/gst-plugins-base.json +++ b/pkgs/flatpak/gstreamer-1.0/gst-plugins-base.json @@ -21,6 +21,10 @@ "url": "https://gitlab.freedesktop.org/gstreamer/gst-plugins-base.git", "tag": "1.18.1", "commit": "4013b8003e78971dd01b055066c12f8aaadb8897" + }, + { + "type": "patch", + "path": "gst-plugins-base-autodetect-subtitle-text-encoding.patch" } ] } diff --git a/pkgs/flatpak/lib/uchardet.json b/pkgs/flatpak/lib/uchardet.json new file mode 100644 index 00000000..3136402f --- /dev/null +++ b/pkgs/flatpak/lib/uchardet.json @@ -0,0 +1,14 @@ +{ + "name": "uchardet", + "buildsystem": "cmake", + "builddir": true, + "config-opts": [ "-DCMAKE_INSTALL_LIBDIR=lib" ], + "sources": [ + { + "type": "git", + "url": "https://gitlab.freedesktop.org/uchardet/uchardet.git", + "tag": "v0.0.7", + "commit": "59f68dbe5709d708b53ad5ea95c7349d7ee6ebe4" + } + ] +}