Initial commit

2023-02-06 02:33:09 +00:00
commit 7fd87b8aa2
71 changed files with 10614 additions and 0 deletions
--- a/devine/core/tracks/track.py
+++ b/devine/core/tracks/track.py
@@ -0,0 +1,335 @@
+import asyncio
+import logging
+import re
+import subprocess
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Optional, Union
+from urllib.parse import urljoin
+
+import m3u8
+import requests
+from langcodes import Language
+
+from devine.core.constants import TERRITORY_MAP
+from devine.core.downloaders import aria2c
+from devine.core.drm import DRM_T
+from devine.core.utilities import get_binary_path
+
+
+class Track:
+    class DRM(Enum):
+        pass
+
+    class Descriptor(Enum):
+        URL = 1  # Direct URL, nothing fancy
+        M3U = 2  # https://en.wikipedia.org/wiki/M3U (and M3U8)
+        MPD = 3  # https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+
+    def __init__(
+        self,
+        id_: str,
+        url: Union[str, list[str]],
+        language: Union[Language, str],
+        is_original_lang: bool = False,
+        descriptor: Descriptor = Descriptor.URL,
+        needs_proxy: bool = False,
+        needs_repack: bool = False,
+        drm: Optional[Iterable[DRM_T]] = None,
+        edition: Optional[str] = None,
+        extra: Optional[Any] = None
+    ) -> None:
+        self.id = id_
+        self.url = url
+        # required basic metadata
+        self.language = Language.get(language)
+        self.is_original_lang = bool(is_original_lang)
+        # optional io metadata
+        self.descriptor = descriptor
+        self.needs_proxy = bool(needs_proxy)
+        self.needs_repack = bool(needs_repack)
+        # drm
+        self.drm = drm
+        # extra data
+        self.edition: str = edition
+        self.extra: Any = extra or {}  # allow anything for extra, but default to a dict
+
+        # events
+        self.OnSegmentFilter: Optional[Callable] = None
+        self.OnDownloaded: Optional[Callable] = None
+        self.OnDecrypted: Optional[Callable] = None
+        self.OnRepacked: Optional[Callable] = None
+
+        # should only be set internally
+        self.path: Optional[Path] = None
+
+    def __repr__(self) -> str:
+        return "{name}({items})".format(
+            name=self.__class__.__name__,
+            items=", ".join([f"{k}={repr(v)}" for k, v in self.__dict__.items()])
+        )
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, Track) and self.id == other.id
+
+    def get_track_name(self) -> Optional[str]:
+        """Return the base Track Name. This may be enhanced in sub-classes."""
+        if (self.language.language or "").lower() == (self.language.territory or "").lower():
+            self.language.territory = None  # e.g. en-en, de-DE
+        if self.language.territory == "US":
+            self.language.territory = None
+        reduced = self.language.simplify_script()
+        extra_parts = []
+        if reduced.script is not None:
+            extra_parts.append(reduced.script_name(max_distance=25))
+        if reduced.territory is not None:
+            territory = reduced.territory_name(max_distance=25)
+            extra_parts.append(TERRITORY_MAP.get(territory, territory))
+        return ", ".join(extra_parts) or None
+
+    def get_init_segment(self, session: Optional[requests.Session] = None) -> bytes:
+        """
+        Get the Track's Initial Segment Data Stream.
+        If the Track URL is not detected to be an init segment, it will download
+        up to the first 20,000 (20KB) bytes only.
+        """
+        if not session:
+            session = requests.Session()
+
+        url = None
+        is_init_stream = False
+
+        if self.descriptor == self.Descriptor.M3U:
+            master = m3u8.loads(session.get(self.url).text, uri=self.url)
+            for segment in master.segments:
+                if not segment.init_section:
+                    continue
+                # skip any segment that would be skipped from the download
+                # as we cant consider these a true initial segment
+                if callable(self.OnSegmentFilter) and self.OnSegmentFilter(segment):
+                    continue
+                url = ("" if re.match("^https?://", segment.init_section.uri) else segment.init_section.base_uri)
+                url += segment.init_section.uri
+                is_init_stream = True
+                break
+
+        if not url:
+            url = self.url
+
+        if isinstance(url, list):
+            url = url[0]
+            is_init_stream = True
+
+        if is_init_stream:
+            return session.get(url).content
+
+        # likely a full single-file download, get first 20k bytes
+        with session.get(url, stream=True) as s:
+            # assuming enough to contain the pssh/kid
+            for chunk in s.iter_content(20000):
+                # we only want the first chunk
+                return chunk
+
+    def download(self, out: Path, name_template: str = "{type}_{id}", headers: Optional[dict] = None,
+                 proxy: Optional[str] = None) -> Path:
+        """
+        Download the Track and apply any necessary post-edits like Subtitle conversion.
+
+        Parameters:
+            out: Output Directory Path for the downloaded track.
+            name_template: Override the default filename template.
+                Must contain both `{type}` and `{id}` variables.
+            headers: Headers to use when downloading.
+            proxy: Proxy to use when downloading.
+
+        Returns:
+            Where the file was saved, as a Path object.
+        """
+        if out.is_file():
+            raise ValueError("Path must be to a directory and not a file")
+
+        log = logging.getLogger("download")
+
+        out.mkdir(parents=True, exist_ok=True)
+
+        file_name = name_template.format(
+            type=self.__class__.__name__,
+            id=self.id
+        )
+
+        # we must use .mp4 on tracks:
+        # - as shaka-packager expects mp4 input and mp4 output
+        # - and mkvtoolnix would try to parse the file in raw-bitstream
+        save_path = (out / file_name).with_suffix(".mp4")
+        if self.__class__.__name__ == "Subtitle":
+            save_path = save_path.with_suffix(f".{self.codec.extension}")
+
+        # these would be files like .decrypted, .repack and such.
+        # we cannot trust that these files were not interrupted while writing to disc
+        # lets just delete them before re-attempting a download
+        for existing_file in save_path.parent.glob(f"{save_path.stem}.*{save_path.suffix}"):
+            existing_file.unlink()
+        save_path.with_suffix(".srt").unlink(missing_ok=True)
+
+        if self.descriptor == self.Descriptor.M3U:
+            master = m3u8.loads(
+                requests.get(
+                    self.url,
+                    headers=headers,
+                    proxies={"all": proxy} if self.needs_proxy and proxy else None
+                ).text,
+                uri=self.url
+            )
+
+            if not master.segments:
+                raise ValueError("Track URI (an M3U8) has no segments...")
+
+            if all(segment.uri == master.segments[0].uri for segment in master.segments):
+                # all segments use the same file, presumably an EXT-X-BYTERANGE M3U (FUNI)
+                # TODO: This might be a risky way to deal with these kinds of Playlists
+                #       What if there's an init section, or one segment is reusing a byte-range
+                segment = master.segments[0]
+                if not re.match("^https?://", segment.uri):
+                    segment.uri = urljoin(segment.base_uri, segment.uri)
+                self.url = segment.uri
+                self.descriptor = self.Descriptor.URL
+            else:
+                has_init = False
+                segments = []
+                for segment in master.segments:
+                    # merge base uri with uri where needed in both normal and init segments
+                    if not re.match("^https?://", segment.uri):
+                        segment.uri = segment.base_uri + segment.uri
+                    if segment.init_section and not re.match("^https?://", segment.init_section.uri):
+                        segment.init_section.uri = segment.init_section.base_uri + segment.init_section.uri
+
+                    if segment.discontinuity:
+                        has_init = False
+
+                    # skip segments we don't want to download (e.g., bumpers, dub cards)
+                    if callable(self.OnSegmentFilter) and self.OnSegmentFilter(segment):
+                        continue
+
+                    if segment.init_section and not has_init:
+                        segments.append(segment.init_section.uri)
+                        has_init = True
+                    segments.append(segment.uri)
+                self.url = list(dict.fromkeys(segments))
+
+        is_segmented = isinstance(self.url, list) and len(self.url) > 1
+        segments_dir = save_path.with_name(save_path.name + "_segments")
+
+        attempts = 1
+        while True:
+            try:
+                asyncio.run(aria2c(
+                    self.url,
+                    [save_path, segments_dir][is_segmented],
+                    headers,
+                    proxy if self.needs_proxy else None
+                ))
+                break
+            except subprocess.CalledProcessError:
+                log.info(f" - Download attempt {attempts} failed, {['retrying', 'stopping'][attempts == 3]}...")
+                if attempts == 3:
+                    raise
+                attempts += 1
+
+        if is_segmented:
+            # merge the segments together
+            with open(save_path, "wb") as f:
+                for file in sorted(segments_dir.iterdir()):
+                    data = file.read_bytes()
+                    # Apple TV+ needs this done to fix audio decryption
+                    data = re.sub(b"(tfhd\x00\x02\x00\x1a\x00\x00\x00\x01\x00\x00\x00)\x02", b"\\g<1>\x01", data)
+                    f.write(data)
+                    file.unlink()  # delete, we don't need it anymore
+            segments_dir.rmdir()
+
+        self.path = save_path
+
+        if self.path.stat().st_size <= 3:  # Empty UTF-8 BOM == 3 bytes
+            raise IOError(
+                "Download failed, the downloaded file is empty. "
+                f"This {'was' if self.needs_proxy else 'was not'} downloaded with a proxy." +
+                (
+                    " Perhaps you need to set `needs_proxy` as True to use the proxy for this track."
+                    if not self.needs_proxy else ""
+                )
+            )
+
+        return self.path
+
+    def delete(self) -> None:
+        if self.path:
+            self.path.unlink()
+            self.path = None
+
+    def repackage(self) -> None:
+        if not self.path or not self.path.exists():
+            raise ValueError("Cannot repackage a Track that has not been downloaded.")
+
+        executable = get_binary_path("ffmpeg")
+        if not executable:
+            raise EnvironmentError("FFmpeg executable \"ffmpeg\" was not found but is required for this call.")
+
+        repacked_path = self.path.with_suffix(f".repack{self.path.suffix}")
+
+        def _ffmpeg(extra_args: list[str] = None):
+            subprocess.run(
+                [
+                    executable, "-hide_banner",
+                    "-loglevel", "error",
+                    "-i", self.path,
+                    *(extra_args or []),
+                    # Following are very important!
+                    "-map_metadata", "-1",  # don't transfer metadata to output file
+                    "-fflags", "bitexact",  # only have minimal tag data, reproducible mux
+                    "-codec", "copy",
+                    str(repacked_path)
+                ],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE
+            )
+
+        try:
+            _ffmpeg()
+        except subprocess.CalledProcessError as e:
+            if b"Malformed AAC bitstream detected" in e.stderr:
+                # e.g., TruTV's dodgy encodes
+                _ffmpeg(["-y", "-bsf:a", "aac_adtstoasc"])
+            else:
+                raise
+
+        self.swap(repacked_path)
+
+    def move(self, target: Union[str, Path]) -> bool:
+        """
+        Move the Track's file from current location, to target location.
+        This will overwrite anything at the target path.
+        """
+        if not self.path:
+            return False
+        target = Path(target)
+        ok = self.path.rename(target).resolve() == target.resolve()
+        if ok:
+            self.path = target
+        return ok
+
+    def swap(self, target: Union[str, Path]) -> bool:
+        """
+        Swaps the Track's file with the Target file. The current Track's file is deleted.
+        Returns False if the Track is not yet downloaded, or the target path does not exist.
+        """
+        target = Path(target)
+        if not target.exists() or not self.path:
+            return False
+        self.path.unlink()
+        ok = target.rename(self.path) == self.path
+        if not ok:
+            return False
+        return self.move(target)
+
+
+__ALL__ = (Track,)