Initial commit
This commit is contained in:
399
devine/core/tracks/subtitle.py
Normal file
399
devine/core/tracks/subtitle.py
Normal file
@@ -0,0 +1,399 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
import pycaption
|
||||
from construct import Container
|
||||
from pycaption import Caption, CaptionList, CaptionNode, WebVTTReader
|
||||
from pycaption.geometry import Layout
|
||||
from pymp4.parser import MP4
|
||||
from subtitle_filter import Subtitles
|
||||
|
||||
from devine.core.tracks.track import Track
|
||||
from devine.core.utilities import get_binary_path
|
||||
|
||||
|
||||
class Subtitle(Track):
|
||||
class Codec(str, Enum):
|
||||
SubRip = "SRT" # https://wikipedia.org/wiki/SubRip
|
||||
SubStationAlpha = "SSA" # https://wikipedia.org/wiki/SubStation_Alpha
|
||||
SubStationAlphav4 = "ASS" # https://wikipedia.org/wiki/SubStation_Alpha#Advanced_SubStation_Alpha=
|
||||
TimedTextMarkupLang = "TTML" # https://wikipedia.org/wiki/Timed_Text_Markup_Language
|
||||
WebVTT = "VTT" # https://wikipedia.org/wiki/WebVTT
|
||||
# MPEG-DASH box-encapsulated subtitle formats
|
||||
fTTML = "STPP" # https://www.w3.org/TR/2018/REC-ttml-imsc1.0.1-20180424
|
||||
fVTT = "WVTT" # https://www.w3.org/TR/webvtt1
|
||||
|
||||
@property
|
||||
def extension(self) -> str:
|
||||
return self.value.lower()
|
||||
|
||||
@staticmethod
|
||||
def from_mime(mime: str) -> Subtitle.Codec:
|
||||
mime = mime.lower().strip().split(".")[0]
|
||||
if mime == "srt":
|
||||
return Subtitle.Codec.SubRip
|
||||
elif mime == "ssa":
|
||||
return Subtitle.Codec.SubStationAlpha
|
||||
elif mime == "ass":
|
||||
return Subtitle.Codec.SubStationAlphav4
|
||||
elif mime == "ttml":
|
||||
return Subtitle.Codec.TimedTextMarkupLang
|
||||
elif mime == "vtt":
|
||||
return Subtitle.Codec.WebVTT
|
||||
elif mime == "stpp":
|
||||
return Subtitle.Codec.fTTML
|
||||
elif mime == "wvtt":
|
||||
return Subtitle.Codec.fVTT
|
||||
raise ValueError(f"The MIME '{mime}' is not a supported Subtitle Codec")
|
||||
|
||||
@staticmethod
|
||||
def from_codecs(codecs: str) -> Subtitle.Codec:
|
||||
for codec in codecs.lower().split(","):
|
||||
mime = codec.strip().split(".")[0]
|
||||
try:
|
||||
return Subtitle.Codec.from_mime(mime)
|
||||
except ValueError:
|
||||
pass
|
||||
raise ValueError(f"No MIME types matched any supported Subtitle Codecs in '{codecs}'")
|
||||
|
||||
@staticmethod
|
||||
def from_netflix_profile(profile: str) -> Subtitle.Codec:
|
||||
profile = profile.lower().strip()
|
||||
if profile.startswith("webvtt"):
|
||||
return Subtitle.Codec.WebVTT
|
||||
if profile.startswith("dfxp"):
|
||||
return Subtitle.Codec.TimedTextMarkupLang
|
||||
raise ValueError(f"The Content Profile '{profile}' is not a supported Subtitle Codec")
|
||||
|
||||
def __init__(self, *args: Any, codec: Subtitle.Codec, cc: bool = False, sdh: bool = False, forced: bool = False,
|
||||
**kwargs: Any):
|
||||
"""
|
||||
Information on Subtitle Types:
|
||||
https://bit.ly/2Oe4fLC (3PlayMedia Blog on SUB vs CC vs SDH).
|
||||
However, I wouldn't pay much attention to the claims about SDH needing to
|
||||
be in the original source language. It's logically not true.
|
||||
|
||||
CC == Closed Captions. Source: Basically every site.
|
||||
SDH = Subtitles for the Deaf or Hard-of-Hearing. Source: Basically every site.
|
||||
HOH = Exact same as SDH. Is a term used in the UK. Source: https://bit.ly/2PGJatz (ICO UK)
|
||||
|
||||
More in-depth information, examples, and stuff to look for can be found in the Parameter
|
||||
explanation list below.
|
||||
|
||||
Parameters:
|
||||
cc: Closed Caption.
|
||||
- Intended as if you couldn't hear the audio at all.
|
||||
- Can have Sound as well as Dialogue, but doesn't have to.
|
||||
- Original source would be from an EIA-CC encoded stream. Typically all
|
||||
upper-case characters.
|
||||
Indicators of it being CC without knowing original source:
|
||||
- Extracted with CCExtractor, or
|
||||
- >>> (or similar) being used at the start of some or all lines, or
|
||||
- All text is uppercase or at least the majority, or
|
||||
- Subtitles are Scrolling-text style (one line appears, oldest line
|
||||
then disappears).
|
||||
Just because you downloaded it as a SRT or VTT or such, doesn't mean it
|
||||
isn't from an EIA-CC stream. And I wouldn't take the streaming services
|
||||
(CC) as gospel either as they tend to get it wrong too.
|
||||
sdh: Deaf or Hard-of-Hearing. Also known as HOH in the UK (EU?).
|
||||
- Intended as if you couldn't hear the audio at all.
|
||||
- MUST have Sound as well as Dialogue to be considered SDH.
|
||||
- It has no "syntax" or "format" but is not transmitted using archaic
|
||||
forms like EIA-CC streams, would be intended for transmission via
|
||||
SubRip (SRT), WebVTT (VTT), TTML, etc.
|
||||
If you can see important audio/sound transcriptions and not just dialogue
|
||||
and it doesn't have the indicators of CC, then it's most likely SDH.
|
||||
If it doesn't have important audio/sounds transcriptions it might just be
|
||||
regular subtitling (you wouldn't mark as CC or SDH). This would be the
|
||||
case for most translation subtitles. Like Anime for example.
|
||||
forced: Typically used if there's important information at some point in time
|
||||
like watching Dubbed content and an important Sign or Letter is shown
|
||||
or someone talking in a different language.
|
||||
Forced tracks are recommended by the Matroska Spec to be played if
|
||||
the player's current playback audio language matches a subtitle
|
||||
marked as "forced".
|
||||
However, that doesn't mean every player works like this but there is
|
||||
no other way to reliably work with Forced subtitles where multiple
|
||||
forced subtitles may be in the output file. Just know what to expect
|
||||
with "forced" subtitles.
|
||||
"""
|
||||
super().__init__(*args, **kwargs)
|
||||
self.codec = codec
|
||||
self.cc = bool(cc)
|
||||
self.sdh = bool(sdh)
|
||||
if self.cc and self.sdh:
|
||||
raise ValueError("A text track cannot be both CC and SDH.")
|
||||
self.forced = bool(forced)
|
||||
if (self.cc or self.sdh) and self.forced:
|
||||
raise ValueError("A text track cannot be CC/SDH as well as Forced.")
|
||||
|
||||
def get_track_name(self) -> Optional[str]:
|
||||
"""Return the base Track Name."""
|
||||
track_name = super().get_track_name() or ""
|
||||
flag = self.cc and "CC" or self.sdh and "SDH" or self.forced and "Forced"
|
||||
if flag:
|
||||
if track_name:
|
||||
flag = f" ({flag})"
|
||||
track_name += flag
|
||||
return track_name or None
|
||||
|
||||
@staticmethod
|
||||
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
|
||||
# TODO: Use an "enum" for subtitle codecs
|
||||
if not isinstance(data, bytes):
|
||||
raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")
|
||||
try:
|
||||
if codec == Subtitle.Codec.fTTML:
|
||||
captions: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
||||
for segment in (
|
||||
Subtitle.parse(box.data, Subtitle.Codec.TimedTextMarkupLang)
|
||||
for box in MP4.parse_stream(BytesIO(data))
|
||||
if box.type == b"mdat"
|
||||
):
|
||||
for lang in segment.get_languages():
|
||||
captions[lang].extend(segment.get_captions(lang))
|
||||
captions: pycaption.CaptionSet = pycaption.CaptionSet(captions)
|
||||
return captions
|
||||
if codec == Subtitle.Codec.TimedTextMarkupLang:
|
||||
text = data.decode("utf8").replace("tt:", "")
|
||||
return pycaption.DFXPReader().read(text)
|
||||
if codec == Subtitle.Codec.fVTT:
|
||||
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
||||
caption_list, language = Subtitle.merge_segmented_wvtt(data)
|
||||
caption_lists[language] = caption_list
|
||||
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
|
||||
return caption_set
|
||||
if codec == Subtitle.Codec.WebVTT:
|
||||
text = data.decode("utf8").replace("\r", "").replace("\n\n\n", "\n \n\n").replace("\n\n<", "\n<")
|
||||
captions: pycaption.CaptionSet = pycaption.WebVTTReader().read(text)
|
||||
return captions
|
||||
except pycaption.exceptions.CaptionReadSyntaxError:
|
||||
raise SyntaxError(f"A syntax error has occurred when reading the \"{codec}\" subtitle")
|
||||
except pycaption.exceptions.CaptionReadNoCaptions:
|
||||
return pycaption.CaptionSet({"en": []})
|
||||
|
||||
raise ValueError(f"Unknown Subtitle Format \"{codec}\"...")
|
||||
|
||||
@staticmethod
|
||||
def merge_same_cues(caption_set: pycaption.CaptionSet):
|
||||
"""Merge captions with the same timecodes and text as one in-place."""
|
||||
for lang in caption_set.get_languages():
|
||||
captions = caption_set.get_captions(lang)
|
||||
last_caption = None
|
||||
concurrent_captions = pycaption.CaptionList()
|
||||
merged_captions = pycaption.CaptionList()
|
||||
for caption in captions:
|
||||
if last_caption:
|
||||
if (caption.start, caption.end) == (last_caption.start, last_caption.end):
|
||||
if caption.get_text() != last_caption.get_text():
|
||||
concurrent_captions.append(caption)
|
||||
last_caption = caption
|
||||
continue
|
||||
else:
|
||||
merged_captions.append(pycaption.base.merge(concurrent_captions))
|
||||
concurrent_captions = [caption]
|
||||
last_caption = caption
|
||||
|
||||
if concurrent_captions:
|
||||
merged_captions.append(pycaption.base.merge(concurrent_captions))
|
||||
if merged_captions:
|
||||
caption_set.set_captions(lang, merged_captions)
|
||||
|
||||
@staticmethod
|
||||
def merge_segmented_wvtt(data: bytes, period_start: float = 0.) -> tuple[CaptionList, Optional[str]]:
|
||||
"""
|
||||
Convert Segmented DASH WebVTT cues into a pycaption Caption List.
|
||||
Also returns an ISO 639-2 alpha-3 language code if available.
|
||||
|
||||
Code ported originally by xhlove to Python from shaka-player.
|
||||
Has since been improved upon by rlaphoenix using pymp4 and
|
||||
pycaption functions.
|
||||
"""
|
||||
captions = CaptionList()
|
||||
|
||||
# init:
|
||||
saw_wvtt_box = False
|
||||
timescale = None
|
||||
language = None
|
||||
|
||||
# media:
|
||||
# > tfhd
|
||||
default_duration = None
|
||||
# > tfdt
|
||||
saw_tfdt_box = False
|
||||
base_time = 0
|
||||
# > trun
|
||||
saw_trun_box = False
|
||||
samples = []
|
||||
|
||||
def flatten_boxes(box: Container) -> Iterable[Container]:
|
||||
for child in box:
|
||||
if hasattr(child, "children"):
|
||||
yield from flatten_boxes(child.children)
|
||||
del child["children"]
|
||||
if hasattr(child, "entries"):
|
||||
yield from flatten_boxes(child.entries)
|
||||
del child["entries"]
|
||||
# some boxes (mainly within 'entries') uses format not type
|
||||
child["type"] = child.get("type") or child.get("format")
|
||||
yield child
|
||||
|
||||
for box in flatten_boxes(MP4.parse_stream(BytesIO(data))):
|
||||
# init
|
||||
if box.type == b"mdhd":
|
||||
timescale = box.timescale
|
||||
language = box.language
|
||||
|
||||
if box.type == b"wvtt":
|
||||
saw_wvtt_box = True
|
||||
|
||||
# media
|
||||
if box.type == b"styp":
|
||||
# essentially the start of each segment
|
||||
# media var resets
|
||||
# > tfhd
|
||||
default_duration = None
|
||||
# > tfdt
|
||||
saw_tfdt_box = False
|
||||
base_time = 0
|
||||
# > trun
|
||||
saw_trun_box = False
|
||||
samples = []
|
||||
|
||||
if box.type == b"tfhd":
|
||||
if box.flags.default_sample_duration_present:
|
||||
default_duration = box.default_sample_duration
|
||||
|
||||
if box.type == b"tfdt":
|
||||
saw_tfdt_box = True
|
||||
base_time = box.baseMediaDecodeTime
|
||||
|
||||
if box.type == b"trun":
|
||||
saw_trun_box = True
|
||||
samples = box.sample_info
|
||||
|
||||
if box.type == b"mdat":
|
||||
if not timescale:
|
||||
raise ValueError("Timescale was not found in the Segmented WebVTT.")
|
||||
if not saw_wvtt_box:
|
||||
raise ValueError("The WVTT box was not found in the Segmented WebVTT.")
|
||||
if not saw_tfdt_box:
|
||||
raise ValueError("The TFDT box was not found in the Segmented WebVTT.")
|
||||
if not saw_trun_box:
|
||||
raise ValueError("The TRUN box was not found in the Segmented WebVTT.")
|
||||
|
||||
vttc_boxes = MP4.parse_stream(BytesIO(box.data))
|
||||
current_time = base_time + period_start
|
||||
|
||||
for sample, vttc_box in zip(samples, vttc_boxes):
|
||||
duration = sample.sample_duration or default_duration
|
||||
if sample.sample_composition_time_offsets:
|
||||
current_time += sample.sample_composition_time_offsets
|
||||
|
||||
start_time = current_time
|
||||
end_time = current_time + (duration or 0)
|
||||
current_time = end_time
|
||||
|
||||
if vttc_box.type == b"vtte":
|
||||
# vtte is a vttc that's empty, skip
|
||||
continue
|
||||
|
||||
layout: Optional[Layout] = None
|
||||
nodes: list[CaptionNode] = []
|
||||
|
||||
for cue_box in MP4.parse_stream(BytesIO(vttc_box.data)):
|
||||
if cue_box.type == b"vsid":
|
||||
# this is a V(?) Source ID box, we don't care
|
||||
continue
|
||||
cue_data = cue_box.data.decode("utf8")
|
||||
if cue_box.type == b"sttg":
|
||||
layout = Layout(webvtt_positioning=cue_data)
|
||||
elif cue_box.type == b"payl":
|
||||
nodes.extend([
|
||||
node
|
||||
for line in cue_data.split("\n")
|
||||
for node in [
|
||||
CaptionNode.create_text(WebVTTReader()._decode(line)),
|
||||
CaptionNode.create_break()
|
||||
]
|
||||
])
|
||||
nodes.pop()
|
||||
|
||||
if nodes:
|
||||
caption = Caption(
|
||||
start=start_time * timescale, # as microseconds
|
||||
end=end_time * timescale,
|
||||
nodes=nodes,
|
||||
layout_info=layout
|
||||
)
|
||||
p_caption = captions[-1] if captions else None
|
||||
if p_caption and caption.start == p_caption.end and str(caption.nodes) == str(p_caption.nodes):
|
||||
# it's a duplicate, but lets take its end time
|
||||
p_caption.end = caption.end
|
||||
continue
|
||||
captions.append(caption)
|
||||
|
||||
return captions, language
|
||||
|
||||
def strip_hearing_impaired(self) -> None:
|
||||
"""
|
||||
Strip captions for hearing impaired (SDH).
|
||||
It uses SubtitleEdit if available, otherwise filter-subs.
|
||||
"""
|
||||
if not self.path or not self.path.exists():
|
||||
raise ValueError("You must download the subtitle track first.")
|
||||
|
||||
executable = get_binary_path("SubtitleEdit")
|
||||
if executable:
|
||||
subprocess.run([
|
||||
executable,
|
||||
"/Convert", self.path, "srt",
|
||||
"/overwrite",
|
||||
"/RemoveTextForHI"
|
||||
], check=True)
|
||||
# Remove UTF-8 Byte Order Marks
|
||||
self.path.write_text(
|
||||
self.path.read_text(encoding="utf-8-sig"),
|
||||
encoding="utf8"
|
||||
)
|
||||
else:
|
||||
sub = Subtitles(self.path)
|
||||
sub.filter(
|
||||
rm_fonts=True,
|
||||
rm_ast=True,
|
||||
rm_music=True,
|
||||
rm_effects=True,
|
||||
rm_names=True,
|
||||
rm_author=True
|
||||
)
|
||||
sub.save()
|
||||
|
||||
def download(self, *args, **kwargs) -> Path:
|
||||
save_path = super().download(*args, **kwargs)
|
||||
if self.codec not in (Subtitle.Codec.SubRip, Subtitle.Codec.SubStationAlphav4):
|
||||
caption_set = self.parse(save_path.read_bytes(), self.codec)
|
||||
self.merge_same_cues(caption_set)
|
||||
srt = pycaption.SRTWriter().write(caption_set)
|
||||
# NowTV sometimes has this, when it isn't, causing mux problems
|
||||
srt = srt.replace("MULTI-LANGUAGE SRT\n", "")
|
||||
save_path.write_text(srt, encoding="utf8")
|
||||
self.codec = Subtitle.Codec.SubRip
|
||||
self.move(self.path.with_suffix(".srt"))
|
||||
return save_path
|
||||
|
||||
def __str__(self) -> str:
|
||||
return " | ".join(filter(bool, [
|
||||
"SUB",
|
||||
f"[{self.codec.value}]",
|
||||
str(self.language),
|
||||
self.get_track_name()
|
||||
]))
|
||||
|
||||
|
||||
__ALL__ = (Subtitle,)
|
||||
Reference in New Issue
Block a user