Native concurrent downloading of fragments (#166)

* Option `--concurrent-fragments` (`-N`) to set the number of threads

Related: #165

Known issues:
* When receiving Ctrl+C, the process will exit only after finishing the currently downloading fragments
* The download progress shows the speed of only one thread

Authored by shirt-dev
This commit is contained in:
shirt 2021-03-12 23:46:58 -05:00 committed by GitHub
parent 0a473f2f0f
commit 4cf1e5d2f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 250 additions and 104 deletions

1
.gitignore vendored
View File

@ -60,6 +60,7 @@ yt-dlp.zip
*.mkv *.mkv
*.swf *.swf
*.part *.part
*.part-*
*.ytdl *.ytdl
*.dump *.dump
*.frag *.frag

View File

@ -297,6 +297,8 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t
--no-include-ads Do not download advertisements (default) --no-include-ads Do not download advertisements (default)
## Download Options: ## Download Options:
-N, --concurrent-fragments N Number of fragments to download
concurrently (default is 1)
-r, --limit-rate RATE Maximum download rate in bytes per second -r, --limit-rate RATE Maximum download rate in bytes per second
(e.g. 50K or 4.2M) (e.g. 50K or 4.2M)
-R, --retries RETRIES Number of retries (default is 10), or -R, --retries RETRIES Number of retries (default is 10), or

View File

@ -180,6 +180,8 @@ def _real_main(argv=None):
if opts.overwrites: if opts.overwrites:
# --yes-overwrites implies --no-continue # --yes-overwrites implies --no-continue
opts.continue_dl = False opts.continue_dl = False
if opts.concurrent_fragment_downloads <= 0:
raise ValueError('Concurrent fragments must be positive')
def parse_retries(retries, name=''): def parse_retries(retries, name=''):
if retries in ('inf', 'infinite'): if retries in ('inf', 'infinite'):
@ -463,6 +465,7 @@ def _real_main(argv=None):
'extractor_retries': opts.extractor_retries, 'extractor_retries': opts.extractor_retries,
'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'skip_unavailable_fragments': opts.skip_unavailable_fragments,
'keep_fragments': opts.keep_fragments, 'keep_fragments': opts.keep_fragments,
'concurrent_fragment_downloads': opts.concurrent_fragment_downloads,
'buffersize': opts.buffersize, 'buffersize': opts.buffersize,
'noresizebuffer': opts.noresizebuffer, 'noresizebuffer': opts.noresizebuffer,
'http_chunk_size': opts.http_chunk_size, 'http_chunk_size': opts.http_chunk_size,

View File

@ -1,11 +1,18 @@
from __future__ import unicode_literals from __future__ import unicode_literals
try:
import concurrent.futures
can_threaded_download = True
except ImportError:
can_threaded_download = False
from ..downloader import _get_real_downloader from ..downloader import _get_real_downloader
from .fragment import FragmentFD from .fragment import FragmentFD
from ..compat import compat_urllib_error from ..compat import compat_urllib_error
from ..utils import ( from ..utils import (
DownloadError, DownloadError,
sanitize_open,
urljoin, urljoin,
) )
@ -49,11 +56,29 @@ class DashSegmentsFD(FragmentFD):
assert fragment_base_url assert fragment_base_url
fragment_url = urljoin(fragment_base_url, fragment['path']) fragment_url = urljoin(fragment_base_url, fragment['path'])
if real_downloader:
fragments_to_download.append({ fragments_to_download.append({
'frag_index': frag_index,
'index': i,
'url': fragment_url, 'url': fragment_url,
}) })
continue
if real_downloader:
info_copy = info_dict.copy()
info_copy['fragments'] = fragments_to_download
fd = real_downloader(self.ydl, self.params)
# TODO: Make progress updates work without hooking twice
# for ph in self._progress_hooks:
# fd.add_progress_hook(ph)
success = fd.real_download(filename, info_copy)
if not success:
return False
else:
def download_fragment(fragment):
i = fragment['index']
frag_index = fragment['frag_index']
fragment_url = fragment['url']
ctx['fragment_index'] = frag_index
# In DASH, the first segment contains necessary headers to # In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment # generate a valid MP4 file, so always abort for the first segment
@ -63,8 +88,7 @@ class DashSegmentsFD(FragmentFD):
try: try:
success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
if not success: if not success:
return False return False, frag_index
self._append_fragment(ctx, frag_content)
break break
except compat_urllib_error.HTTPError as err: except compat_urllib_error.HTTPError as err:
# YouTube may often return 404 HTTP error for a fragment causing the # YouTube may often return 404 HTTP error for a fragment causing the
@ -80,27 +104,73 @@ class DashSegmentsFD(FragmentFD):
# Don't retry fragment if error occurred during HTTP downloading # Don't retry fragment if error occurred during HTTP downloading
# itself since it has own retry settings # itself since it has own retry settings
if not fatal: if not fatal:
self.report_skip_fragment(frag_index)
break break
raise raise
if count > fragment_retries: if count > fragment_retries:
if not fatal: if not fatal:
self.report_skip_fragment(frag_index) return False, frag_index
continue
self.report_error('giving up after %s fragment retries' % fragment_retries) self.report_error('giving up after %s fragment retries' % fragment_retries)
return False return False, frag_index
if real_downloader: return frag_content, frag_index
info_copy = info_dict.copy()
info_copy['fragments'] = fragments_to_download def append_fragment(frag_content, frag_index):
fd = real_downloader(self.ydl, self.params) if frag_content:
# TODO: Make progress updates work without hooking twice fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
# for ph in self._progress_hooks: try:
# fd.add_progress_hook(ph) file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
success = fd.real_download(filename, info_copy) ctx['fragment_filename_sanitized'] = frag_sanitized
if not success: file.close()
self._append_fragment(ctx, frag_content)
return True
except FileNotFoundError:
if skip_unavailable_fragments:
self.report_skip_fragment(frag_index)
return True
else:
self.report_error(
'fragment %s not found, unable to continue' % frag_index)
return False return False
else: else:
if skip_unavailable_fragments:
self.report_skip_fragment(frag_index)
return True
else:
self.report_error(
'fragment %s not found, unable to continue' % frag_index)
return False
max_workers = self.params.get('concurrent_fragment_downloads', 1)
if can_threaded_download and max_workers > 1:
self.report_warning('The download speed shown is only of one thread. This is a known issue')
with concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
futures = [pool.submit(download_fragment, fragment) for fragment in fragments_to_download]
# timeout must be 0 to return instantly
done, not_done = concurrent.futures.wait(futures, timeout=0)
try:
while not_done:
# Check every 1 second for KeyboardInterrupt
freshly_done, not_done = concurrent.futures.wait(not_done, timeout=1)
done |= freshly_done
except KeyboardInterrupt:
for future in not_done:
future.cancel()
# timeout must be none to cancel
concurrent.futures.wait(not_done, timeout=None)
raise KeyboardInterrupt
results = [future.result() for future in futures]
for frag_content, frag_index in results:
result = append_fragment(frag_content, frag_index)
if not result:
return False
else:
for fragment in fragments_to_download:
frag_content, frag_index = download_fragment(fragment)
result = append_fragment(frag_content, frag_index)
if not result:
return False
self._finish_frag_download(ctx) self._finish_frag_download(ctx)
return True return True

View File

@ -126,7 +126,7 @@ class ExternalFD(FileDownloader):
file_list = [] file_list = []
dest, _ = sanitize_open(tmpfilename, 'wb') dest, _ = sanitize_open(tmpfilename, 'wb')
for i, fragment in enumerate(info_dict['fragments']): for i, fragment in enumerate(info_dict['fragments']):
file = '%s_%s.frag' % (tmpfilename, i) file = '%s-Frag%d' % (tmpfilename, i)
decrypt_info = fragment.get('decrypt_info') decrypt_info = fragment.get('decrypt_info')
src, _ = sanitize_open(file, 'rb') src, _ = sanitize_open(file, 'rb')
if decrypt_info: if decrypt_info:
@ -274,7 +274,7 @@ class Aria2cFD(ExternalFD):
url_list_file = '%s.frag.urls' % tmpfilename url_list_file = '%s.frag.urls' % tmpfilename
url_list = [] url_list = []
for i, fragment in enumerate(info_dict['fragments']): for i, fragment in enumerate(info_dict['fragments']):
tmpsegmentname = '%s_%s.frag' % (os.path.basename(tmpfilename), i) tmpsegmentname = '%s-Frag%d' % (os.path.basename(tmpfilename), i)
url_list.append('%s\n\tout=%s' % (fragment['url'], tmpsegmentname)) url_list.append('%s\n\tout=%s' % (fragment['url'], tmpsegmentname))
stream, _ = sanitize_open(url_list_file, 'wb') stream, _ = sanitize_open(url_list_file, 'wb')
stream.write('\n'.join(url_list).encode('utf-8')) stream.write('\n'.join(url_list).encode('utf-8'))

View File

@ -7,6 +7,11 @@ try:
can_decrypt_frag = True can_decrypt_frag = True
except ImportError: except ImportError:
can_decrypt_frag = False can_decrypt_frag = False
try:
import concurrent.futures
can_threaded_download = True
except ImportError:
can_threaded_download = False
from ..downloader import _get_real_downloader from ..downloader import _get_real_downloader
from .fragment import FragmentFD from .fragment import FragmentFD
@ -19,6 +24,7 @@ from ..compat import (
) )
from ..utils import ( from ..utils import (
parse_m3u8_attributes, parse_m3u8_attributes,
sanitize_open,
update_url_query, update_url_query,
) )
@ -151,7 +157,6 @@ class HlsFD(FragmentFD):
ad_frag_next = False ad_frag_next = False
for line in s.splitlines(): for line in s.splitlines():
line = line.strip() line = line.strip()
download_frag = False
if line: if line:
if not line.startswith('#'): if not line.startswith('#'):
if format_index and discontinuity_count != format_index: if format_index and discontinuity_count != format_index:
@ -168,13 +173,13 @@ class HlsFD(FragmentFD):
if extra_query: if extra_query:
frag_url = update_url_query(frag_url, extra_query) frag_url = update_url_query(frag_url, extra_query)
if real_downloader:
fragments.append({ fragments.append({
'frag_index': frag_index,
'url': frag_url, 'url': frag_url,
'decrypt_info': decrypt_info, 'decrypt_info': decrypt_info,
'byte_range': byte_range,
'media_sequence': media_sequence,
}) })
continue
download_frag = True
elif line.startswith('#EXT-X-MAP'): elif line.startswith('#EXT-X-MAP'):
if format_index and discontinuity_count != format_index: if format_index and discontinuity_count != format_index:
@ -191,12 +196,14 @@ class HlsFD(FragmentFD):
else compat_urlparse.urljoin(man_url, map_info.get('URI'))) else compat_urlparse.urljoin(man_url, map_info.get('URI')))
if extra_query: if extra_query:
frag_url = update_url_query(frag_url, extra_query) frag_url = update_url_query(frag_url, extra_query)
if real_downloader:
fragments.append({ fragments.append({
'frag_index': frag_index,
'url': frag_url, 'url': frag_url,
'decrypt_info': decrypt_info, 'decrypt_info': decrypt_info,
'byte_range': byte_range,
'media_sequence': media_sequence
}) })
continue
if map_info.get('BYTERANGE'): if map_info.get('BYTERANGE'):
splitted_byte_range = map_info.get('BYTERANGE').split('@') splitted_byte_range = map_info.get('BYTERANGE').split('@')
@ -205,7 +212,6 @@ class HlsFD(FragmentFD):
'start': sub_range_start, 'start': sub_range_start,
'end': sub_range_start + int(splitted_byte_range[0]), 'end': sub_range_start + int(splitted_byte_range[0]),
} }
download_frag = True
elif line.startswith('#EXT-X-KEY'): elif line.startswith('#EXT-X-KEY'):
decrypt_url = decrypt_info.get('URI') decrypt_url = decrypt_info.get('URI')
@ -236,53 +242,12 @@ class HlsFD(FragmentFD):
ad_frag_next = False ad_frag_next = False
elif line.startswith('#EXT-X-DISCONTINUITY'): elif line.startswith('#EXT-X-DISCONTINUITY'):
discontinuity_count += 1 discontinuity_count += 1
if download_frag:
count = 0
headers = info_dict.get('http_headers', {})
if byte_range:
headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1)
while count <= fragment_retries:
try:
success, frag_content = self._download_fragment(
ctx, frag_url, info_dict, headers)
if not success:
return False
break
except compat_urllib_error.HTTPError as err:
# Unavailable (possibly temporary) fragments may be served.
# First we try to retry then either skip or abort.
# See https://github.com/ytdl-org/youtube-dl/issues/10165,
# https://github.com/ytdl-org/youtube-dl/issues/10448).
count += 1
if count <= fragment_retries:
self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if skip_unavailable_fragments:
i += 1 i += 1
media_sequence += 1 media_sequence += 1
self.report_skip_fragment(frag_index)
continue
self.report_error(
'giving up after %s fragment retries' % fragment_retries)
return False
if decrypt_info['METHOD'] == 'AES-128':
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
# Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block
# size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded,
# not what it decrypts to.
if not test:
frag_content = AES.new(
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
self._append_fragment(ctx, frag_content)
# We only download the first fragment during the test # We only download the first fragment during the test
if test: if test:
break fragments = [fragments[0] if fragments else None]
i += 1
media_sequence += 1
if real_downloader: if real_downloader:
info_copy = info_dict.copy() info_copy = info_dict.copy()
@ -295,5 +260,106 @@ class HlsFD(FragmentFD):
if not success: if not success:
return False return False
else: else:
def download_fragment(fragment):
frag_index = fragment['frag_index']
frag_url = fragment['url']
decrypt_info = fragment['decrypt_info']
byte_range = fragment['byte_range']
media_sequence = fragment['media_sequence']
ctx['fragment_index'] = frag_index
count = 0
headers = info_dict.get('http_headers', {})
if byte_range:
headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1)
while count <= fragment_retries:
try:
success, frag_content = self._download_fragment(
ctx, frag_url, info_dict, headers)
if not success:
return False, frag_index
break
except compat_urllib_error.HTTPError as err:
# Unavailable (possibly temporary) fragments may be served.
# First we try to retry then either skip or abort.
# See https://github.com/ytdl-org/youtube-dl/issues/10165,
# https://github.com/ytdl-org/youtube-dl/issues/10448).
count += 1
if count <= fragment_retries:
self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
return False, frag_index
if decrypt_info['METHOD'] == 'AES-128':
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
# Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block
# size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded,
# not what it decrypts to.
if not test:
frag_content = AES.new(
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
return frag_content, frag_index
def append_fragment(frag_content, frag_index):
if frag_content:
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
try:
file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
file.close()
self._append_fragment(ctx, frag_content)
return True
except FileNotFoundError:
if skip_unavailable_fragments:
self.report_skip_fragment(frag_index)
return True
else:
self.report_error(
'fragment %s not found, unable to continue' % frag_index)
return False
else:
if skip_unavailable_fragments:
self.report_skip_fragment(frag_index)
return True
else:
self.report_error(
'fragment %s not found, unable to continue' % frag_index)
return False
max_workers = self.params.get('concurrent_fragment_downloads', 1)
if can_threaded_download and max_workers > 1:
self.report_warning('The download speed shown is only of one thread. This is a known issue')
with concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
futures = [pool.submit(download_fragment, fragment) for fragment in fragments]
# timeout must be 0 to return instantly
done, not_done = concurrent.futures.wait(futures, timeout=0)
try:
while not_done:
# Check every 1 second for KeyboardInterrupt
freshly_done, not_done = concurrent.futures.wait(not_done, timeout=1)
done |= freshly_done
except KeyboardInterrupt:
for future in not_done:
future.cancel()
# timeout must be none to cancel
concurrent.futures.wait(not_done, timeout=None)
raise KeyboardInterrupt
results = [future.result() for future in futures]
for frag_content, frag_index in results:
result = append_fragment(frag_content, frag_index)
if not result:
return False
else:
for fragment in fragments:
frag_content, frag_index = download_fragment(fragment)
result = append_fragment(frag_content, frag_index)
if not result:
return False
self._finish_frag_download(ctx) self._finish_frag_download(ctx)
return True return True

View File

@ -558,6 +558,10 @@ def parseOpts(overrideArguments=None):
help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags') help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')
downloader = optparse.OptionGroup(parser, 'Download Options') downloader = optparse.OptionGroup(parser, 'Download Options')
downloader.add_option(
'-N', '--concurrent-fragments',
dest='concurrent_fragment_downloads', metavar='N', default=1, type=int,
help='Number of fragments to download concurrently (default is %default)')
downloader.add_option( downloader.add_option(
'-r', '--limit-rate', '--rate-limit', '-r', '--limit-rate', '--rate-limit',
dest='ratelimit', metavar='RATE', dest='ratelimit', metavar='RATE',