Completely rewrite downloading system

The new system now downloads and decrypts segments individually instead of downloading all segments, merging them, and then decrypting. Overall, the download system now acts more like a normal player. This fixes #23, as the new HLS download system detects changes in keys and init segments as segments are downloaded. DASH still supports only one period, but hopefully I can change that in the future. The downloading code has also been moved from the Track classes to the manifest classes. Download progress is now also actually helpful for segmented downloads (all HLS, and most DASH streams): it uses tqdm to show a progress bar based on how many segments need to be downloaded and how fast they are being downloaded. There is currently only one downside: downloads of segmented videos no longer benefit from aria2c's -j parameter, which lets it download n URLs concurrently. aria2c is still used, but only -x and -s are going to make a difference. In the future I will make HLS and DASH download in a multi-threaded way, sort of a manual version of -j.
2023-02-21 05:42:00 +00:00
parent c925cb8af9
commit 42aaa03941
6 changed files with 591 additions and 436 deletions
--- a/devine/core/tracks/track.py
+++ b/devine/core/tracks/track.py
@@ -1,21 +1,17 @@
 from __future__ import annotations

-import asyncio
-import logging
 import re
 import shutil
 import subprocess
 from enum import Enum
 from pathlib import Path
 from typing import Any, Callable, Iterable, Optional, Union
-from urllib.parse import urljoin

 import m3u8
 import requests
 from langcodes import Language

 from devine.core.constants import TERRITORY_MAP
-from devine.core.downloaders import aria2c
 from devine.core.drm import DRM_T
 from devine.core.utilities import get_binary_path

@@ -133,136 +129,6 @@ class Track:
                # we only want the first chunk
                return chunk

-    def download(self, out: Path, name_template: str = "{type}_{id}", headers: Optional[dict] = None,
-                 proxy: Optional[str] = None) -> Path:
-        """
-        Download the Track and apply any necessary post-edits like Subtitle conversion.
-
-        Parameters:
-            out: Output Directory Path for the downloaded track.
-            name_template: Override the default filename template.
-                Must contain both `{type}` and `{id}` variables.
-            headers: Headers to use when downloading.
-            proxy: Proxy to use when downloading.
-
-        Returns:
-            Where the file was saved, as a Path object.
-        """
-        if out.is_file():
-            raise ValueError("Path must be to a directory and not a file")
-
-        log = logging.getLogger("download")
-
-        out.mkdir(parents=True, exist_ok=True)
-
-        file_name = name_template.format(
-            type=self.__class__.__name__,
-            id=self.id
-        )
-
-        # we must use .mp4 on tracks:
-        # - as shaka-packager expects mp4 input and mp4 output
-        # - and mkvtoolnix would try to parse the file in raw-bitstream
-        save_path = (out / file_name).with_suffix(".mp4")
-        if self.__class__.__name__ == "Subtitle":
-            save_path = save_path.with_suffix(f".{self.codec.extension}")
-
-        # these would be files like .decrypted, .repack and such.
-        # we cannot trust that these files were not interrupted while writing to disc
-        # lets just delete them before re-attempting a download
-        for existing_file in save_path.parent.glob(f"{save_path.stem}.*{save_path.suffix}"):
-            existing_file.unlink()
-        save_path.with_suffix(".srt").unlink(missing_ok=True)
-
-        if self.descriptor == self.Descriptor.M3U:
-            master = m3u8.loads(
-                requests.get(
-                    self.url,
-                    headers=headers,
-                    proxies={"all": proxy} if self.needs_proxy and proxy else None
-                ).text,
-                uri=self.url
-            )
-
-            if not master.segments:
-                raise ValueError("Track URI (an M3U8) has no segments...")
-
-            if all(segment.uri == master.segments[0].uri for segment in master.segments):
-                # all segments use the same file, presumably an EXT-X-BYTERANGE M3U (FUNI)
-                # TODO: This might be a risky way to deal with these kinds of Playlists
-                #       What if there's an init section, or one segment is reusing a byte-range
-                segment = master.segments[0]
-                if not re.match("^https?://", segment.uri):
-                    segment.uri = urljoin(segment.base_uri, segment.uri)
-                self.url = segment.uri
-                self.descriptor = self.Descriptor.URL
-            else:
-                has_init = False
-                segments = []
-                for segment in master.segments:
-                    # merge base uri with uri where needed in both normal and init segments
-                    if not re.match("^https?://", segment.uri):
-                        segment.uri = segment.base_uri + segment.uri
-                    if segment.init_section and not re.match("^https?://", segment.init_section.uri):
-                        segment.init_section.uri = segment.init_section.base_uri + segment.init_section.uri
-
-                    if segment.discontinuity:
-                        has_init = False
-
-                    # skip segments we don't want to download (e.g., bumpers, dub cards)
-                    if callable(self.OnSegmentFilter) and self.OnSegmentFilter(segment):
-                        continue
-
-                    if segment.init_section and not has_init:
-                        segments.append(segment.init_section.uri)
-                        has_init = True
-                    segments.append(segment.uri)
-                self.url = list(dict.fromkeys(segments))
-
-        is_segmented = isinstance(self.url, list) and len(self.url) > 1
-        segments_dir = save_path.with_name(save_path.name + "_segments")
-
-        attempts = 1
-        while True:
-            try:
-                asyncio.run(aria2c(
-                    self.url,
-                    [save_path, segments_dir][is_segmented],
-                    headers,
-                    proxy if self.needs_proxy else None
-                ))
-                break
-            except subprocess.CalledProcessError:
-                log.info(f" - Download attempt {attempts} failed, {['retrying', 'stopping'][attempts == 3]}...")
-                if attempts == 3:
-                    raise
-                attempts += 1
-
-        if is_segmented:
-            # merge the segments together
-            with open(save_path, "wb") as f:
-                for file in sorted(segments_dir.iterdir()):
-                    data = file.read_bytes()
-                    # fix audio decryption
-                    data = re.sub(b"(tfhd\x00\x02\x00\x1a\x00\x00\x00\x01\x00\x00\x00)\x02", b"\\g<1>\x01", data)
-                    f.write(data)
-                    file.unlink()  # delete, we don't need it anymore
-            segments_dir.rmdir()
-
-        self.path = save_path
-
-        if self.path.stat().st_size <= 3:  # Empty UTF-8 BOM == 3 bytes
-            raise IOError(
-                "Download failed, the downloaded file is empty. "
-                f"This {'was' if self.needs_proxy else 'was not'} downloaded with a proxy." +
-                (
-                    " Perhaps you need to set `needs_proxy` as True to use the proxy for this track."
-                    if not self.needs_proxy else ""
-                )
-            )
-
-        return self.path
-
    def delete(self) -> None:
        if self.path:
            self.path.unlink()