Add option for automatic subtitle character encoding normalization (#68)

* Add option for automatic subtitle character encoding normalization

  The rationale behind this function is that some services use ISO-8859-1 (latin1) or Windows-1252 (CP-1252) instead of UTF-8 encoding, whether intentionally or accidentally. Some services even stream subtitles with malformed/mixed encoding (each segment has a different encoding).

* Remove Subtitle parameter `auto_fix_encoding`

  Just always attempt to fix encoding. If the subtitle is neither UTF-8 nor CP-1252, then it should realistically error out instead of producing garbage Subtitle data anyway.

* Move Subtitle encoding fixing code out of if drm tree
* Use chardet as a last-ditch effort to fix subtitles, or return original data
* Move Subtitle.fix_encoding method to utilities as try_ensure_utf8
* Add Shivelight as a contributor

---------

Co-authored-by: rlaphoenix <rlaphoenix@pm.me>
2023-12-02 19:00:55 +08:00
parent 4b8cfabaac
commit c31ee338dc
7 changed files with 58 additions and 6 deletions
--- a/devine/core/manifests/dash.py
+++ b/devine/core/manifests/dash.py
@@ -31,7 +31,7 @@ from devine.core.downloaders import downloader
 from devine.core.downloaders import requests as requests_downloader
 from devine.core.drm import Widevine
 from devine.core.tracks import Audio, Subtitle, Tracks, Video
-from devine.core.utilities import is_close_match
+from devine.core.utilities import is_close_match, try_ensure_utf8
 from devine.core.utils.xml import load_xml


@@ -471,7 +471,11 @@ class DASH:
                if init_data:
                    f.write(init_data)
                for segment_file in sorted(save_dir.iterdir()):
-                    f.write(segment_file.read_bytes())
+                    segment_data = segment_file.read_bytes()
+                    # TODO: fix encoding after decryption?
+                    if not drm and isinstance(track, Subtitle):
+                        segment_data = try_ensure_utf8(segment_data)
+                    f.write(segment_data)
                    segment_file.unlink()

            if drm:
--- a/devine/core/manifests/hls.py
+++ b/devine/core/manifests/hls.py
@@ -28,7 +28,7 @@ from devine.core.downloaders import downloader
 from devine.core.downloaders import requests as requests_downloader
 from devine.core.drm import DRM_T, ClearKey, Widevine
 from devine.core.tracks import Audio, Subtitle, Tracks, Video
-from devine.core.utilities import is_close_match
+from devine.core.utilities import is_close_match, try_ensure_utf8


 class HLS:
@@ -301,7 +301,10 @@ class HLS:

        with open(save_path, "wb") as f:
            for segment_file in sorted(save_dir.iterdir()):
-                f.write(segment_file.read_bytes())
+                segment_data = segment_file.read_bytes()
+                if isinstance(track, Subtitle):
+                    segment_data = try_ensure_utf8(segment_data)
+                f.write(segment_data)
                segment_file.unlink()

        progress(downloaded="Downloaded")
--- a/devine/core/utilities.py
+++ b/devine/core/utilities.py
@@ -13,6 +13,7 @@ from types import ModuleType
 from typing import AsyncIterator, Optional, Sequence, Union
 from urllib.parse import urlparse

+import chardet
 import pproxy
 import requests
 from construct import ValidationError
@@ -215,6 +216,32 @@ def time_elapsed_since(start: float) -> str:
    return time_string


+def try_ensure_utf8(data: bytes) -> bytes:
+    """
+    Try to ensure that the given data is encoded in UTF-8.
+
+    The data is tried as UTF-8 first, then as CP-1252, and finally as whatever
+    encoding chardet detects. The first decode that succeeds wins.
+
+    Parameters:
+        data: Input data that may or may not yet be UTF-8 or another encoding.
+
+    Returns the input data encoded in UTF-8 if successful. If unable to detect the
+    encoding of the input data, or the detected encoding cannot decode it, then the
+    original data is returned as-received. This function never raises on bad input.
+    """
+    try:
+        data.decode("utf8")
+        # already valid UTF-8, hand back the very same bytes
+        return data
+    except UnicodeDecodeError:
+        pass
+
+    try:
+        # CP-1252 covers every printable latin1 character. Python's cp1252 codec
+        # rejects the five bytes that Windows-1252 leaves undefined (0x81, 0x8D,
+        # 0x8F, 0x90, 0x9D), which is the only way this attempt can fail.
+        return data.decode("cp1252").encode("utf8")
+    except UnicodeDecodeError:
+        pass
+
+    # last ditch effort to detect encoding
+    detected_encoding = chardet.detect(data)["encoding"]
+    if not detected_encoding:
+        # chardet reports None when it has no guess; decode(None) would be a TypeError
+        return data
+
+    try:
+        return data.decode(detected_encoding).encode("utf8")
+    except (UnicodeDecodeError, LookupError):
+        # LookupError: chardet named an encoding Python has no codec for
+        return data
+
+
@contextlib.asynccontextmanager
 async def start_pproxy(proxy: str) -> AsyncIterator[str]:
    proxy = urlparse(proxy)