Add option for automatic subtitle character encoding normalization (#68)

* Add option for automatic subtitle character encoding normalization The rationale behind this function is that some services use ISO-8859-1 (latin1) or Windows-1252 (CP-1252) instead of UTF-8 encoding, whether intentionally or accidentally. Some services even stream subtitles with malformed/mixed encoding (each segment has a different encoding). * Remove Subtitle parameter `auto_fix_encoding` Just always attempt to fix encoding. If the subtitle is neither UTF-8 nor CP-1252, then it should realistically error out instead of producing garbage Subtitle data anyway. * Move Subtitle encoding fixing code out of if drm tree * Use chardet as a last ditch effort fixing Subs, or return original data * Move Subtitle.fix_encoding method to utilities as try_ensure_utf8 * Add Shivelight as a contributor --------- Co-authored-by: rlaphoenix <rlaphoenix@pm.me>
2023-12-02 19:00:55 +08:00
parent 4b8cfabaac
commit c31ee338dc
7 changed files with 58 additions and 6 deletions
--- a/poetry.lock
+++ b/poetry.lock
@@ -294,6 +294,17 @@ files = [
    {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
 ]

+[[package]]
+name = "chardet"
+version = "5.2.0"
+description = "Universal encoding detector for Python 3"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
+    {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.2.0"
@@ -1769,4 +1780,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9.0,<3.12"
-content-hash = "d19aedf3a21dff6327497d42b56dbff63cef4d2899fa886c52c058e5439fee87"
+content-hash = "725552b13f9ba04c99b77a5cc96eef121abdd80eb5e67188981f6940fdc88015"