commit bd17239fea
Author: Elyse
Date:   2024-05-06 02:18:09 +03:00 (committed via GitHub)

8 changed files with 111 additions and 34 deletions

test/test_utils.py

@@ -411,10 +411,15 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
         self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667)
         self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1)
         self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86)
         self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78)
+        self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646)
+        # ISO8601 spec says that if no timezone is specified, we should use local timezone;
+        # but yt-dlp uses UTC to keep things consistent
+        self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314)

     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')

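A minimal sketch of what the new tests exercise, assuming only the public yt_dlp.utils API (an illustration, not part of the commit):

    from yt_dlp.utils import unified_timestamp

    # By default the result is truncated to whole seconds
    assert unified_timestamp('2023-03-09T18:01:33.646Z') == 1678384893
    # Opting in via the new keyword keeps the fractional part
    assert unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True) == 1678384893.646
    # No timezone given: interpreted as UTC for consistency, not local time
    assert unified_timestamp('2023-03-11T06:48:34.008') == 1678517314
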
yt_dlp/YoutubeDL.py

@@ -27,7 +27,12 @@ from .cache import Cache
 from .compat import functools, urllib  # isort: split
 from .compat import compat_os_name, urllib_req_to_req
 from .cookies import LenientSimpleCookie, load_cookies
-from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader import (
+    DashSegmentsFD,
+    FFmpegFD,
+    get_suitable_downloader,
+    shorten_protocol_name,
+)
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
 from .extractor.common import UnsupportedURLIE

@@ -3353,7 +3358,7 @@ class YoutubeDL:
             fd, success = None, True
             if info_dict.get('protocol') or info_dict.get('url'):
                 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
-                if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
+                if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and (
                        info_dict.get('section_start') or info_dict.get('section_end')):
                    msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
                           else 'You have requested downloading the video partially, but ffmpeg is not installed')

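The effect of the YoutubeDL.py change: a section request no longer forces the ffmpeg downloader when DashSegmentsFD was selected, since that downloader now honours section bounds itself. A hedged paraphrase of the guard as a standalone predicate (hypothetical helper name, illustration only):

    def forces_ffmpeg_for_sections(fd, params, info_dict):
        # True when a partial download was requested but the chosen
        # downloader cannot cut sections natively (illustration only)
        wants_section = bool(info_dict.get('section_start') or info_dict.get('section_end'))
        return (fd not in (FFmpegFD, DashSegmentsFD)
                and 'no-direct-merge' not in params['compat_opts']
                and wants_section)
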
yt_dlp/__init__.py

@@ -12,6 +12,7 @@ import itertools
 import optparse
 import os
 import re
+import time
 import traceback

 from .compat import compat_os_name, compat_shlex_quote

@@ -331,12 +332,13 @@ def validate_options(opts):
                 (?P<end_sign>-?)(?P<end>[^-]+)
             )?'''

+            current_time = time.time()
             chapters, ranges, from_url = [], [], False
             for regex in value or []:
                 if advanced and regex == '*from-url':
                     from_url = True
                     continue
-                elif not regex.startswith('*'):
+                elif not regex.startswith('*') and not regex.startswith('#'):
                     try:
                         chapters.append(re.compile(regex))
                     except re.error as err:

@@ -353,11 +355,16 @@ def validate_options(opts):
                         err = 'Must be of the form "*start-end"'
                     elif not advanced and any(signs):
                         err = 'Negative timestamps are not allowed'
-                    else:
+                    elif regex.startswith('*'):
                         dur[0] *= -1 if signs[0] else 1
                         dur[1] *= -1 if signs[1] else 1
                         if dur[1] == float('-inf'):
                             err = '"-inf" is not a valid end'
+                    elif regex.startswith('#'):
+                        dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time
+                        dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time
+                        if dur[1] == float('-inf'):
+                            err = '"-inf" is not a valid end'
                     if err:
                         raise ValueError(f'invalid {name} time range "{regex}". {err}')
                     ranges.append(dur)

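Under the new '#' prefix, both endpoints are offsets from the wall clock at option-parsing time (negative meaning "in the past"), whereas '*' keeps absolute timestamps. A standalone sketch of the arithmetic, not the actual validate_options code:

    import time

    def resolve_hash_range(start, end, start_negative, end_negative):
        # '#-24hours - 0' => [now - 86400, now]
        now = time.time()
        return (start * (-1 if start_negative else 1) + now,
                end * (-1 if end_negative else 1) + now)

    # '#-1h - 30m': from one hour ago until 30 minutes from now
    section_start, section_end = resolve_hash_range(3600, 1800, True, False)
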
yt_dlp/downloader/dash.py

@@ -36,6 +36,8 @@ class DashSegmentsFD(FragmentFD):
             'filename': fmt.get('filepath') or filename,
             'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
             'total_frags': fragment_count,
+            'section_start': info_dict.get('section_start'),
+            'section_end': info_dict.get('section_end'),
         }

         if real_downloader:

yt_dlp/extractor/common.py

@@ -2695,7 +2695,7 @@ class InfoExtractor:
                     r = int(s.get('r', 0))
                     ms_info['total_number'] += 1 + r
                     ms_info['s'].append({
-                        't': int(s.get('t', 0)),
+                        't': int_or_none(s.get('t')),
                         # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                         'd': int(s.attrib['d']),
                         'r': r,

@@ -2737,8 +2737,14 @@ class InfoExtractor:
             return ms_info

         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        availability_start_time = unified_timestamp(
+            mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
         stream_numbers = collections.defaultdict(int)
         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+            # segmentIngestTime is completely out of spec, but YT Livestream do this
+            segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
+            if segment_ingest_time:
+                availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
             period_entry = {
                 'id': period.get('id', f'period-{period_idx}'),
                 'formats': [],

@@ -2917,13 +2923,17 @@ class InfoExtractor:
                                 'Bandwidth': bandwidth,
                                 'Number': segment_number,
                             }
+                            duration = float_or_none(segment_d, representation_ms_info['timescale'])
+                            start = float_or_none(segment_time, representation_ms_info['timescale'])
                             representation_ms_info['fragments'].append({
                                 media_location_key: segment_url,
-                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                'duration': duration,
+                                'start': availability_start_time + start,
+                                'end': availability_start_time + start + duration,
                             })

                         for num, s in enumerate(representation_ms_info['s']):
-                            segment_time = s.get('t') or segment_time
+                            segment_time = s['t'] if s.get('t') is not None else segment_time
                             segment_d = s['d']
                             add_segment_url()
                             segment_number += 1

@@ -2939,6 +2949,7 @@ class InfoExtractor:
                 fragments = []
                 segment_index = 0
                 timescale = representation_ms_info['timescale']
+                start = 0
                 for s in representation_ms_info['s']:
                     duration = float_or_none(s['d'], timescale)
                     for r in range(s.get('r', 0) + 1):

@@ -2946,8 +2957,11 @@ class InfoExtractor:
                         fragments.append({
                             location_key(segment_uri): segment_uri,
                             'duration': duration,
+                            'start': availability_start_time + start,
+                            'end': availability_start_time + start + duration,
                         })
                         segment_index += 1
+                        start += duration

                 representation_ms_info['fragments'] = fragments
             elif 'segment_urls' in representation_ms_info:
                 # Segment URLs with no SegmentTimeline

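With these changes every DASH fragment carries absolute wall-clock bounds: tick values from the SegmentTimeline are divided by the timescale and offset by availabilityStartTime (or by YouTube's out-of-spec segmentIngestTime when present). A worked example with assumed numbers:

    # Assumed inputs, for illustration: one timeline segment with
    # t=90000, d=45000 at timescale=90000, in an MPD whose
    # availabilityStartTime resolved to 1678380000.0 (unix seconds)
    timescale = 90000
    availability_start_time = 1678380000.0
    t, d = 90000, 45000

    start = t / timescale        # 1.0 second into the period
    duration = d / timescale     # 0.5 seconds
    fragment = {
        'duration': duration,
        'start': availability_start_time + start,             # 1678380001.0
        'end': availability_start_time + start + duration,    # 1678380001.5
    }
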
yt_dlp/extractor/youtube.py

@@ -2791,17 +2791,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             microformats = traverse_obj(
                 prs, (..., 'microformat', 'playerMicroformatRenderer'),
                 expected_type=dict)
-            _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
-            is_live = live_status == 'is_live'
-            start_time = time.time()
+            with lock:
+                _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
+                is_live = live_status == 'is_live'
+                start_time = time.time()

         def mpd_feed(format_id, delay):
             """
             @returns (manifest_url, manifest_stream_number, is_live) or None
             """
             for retry in self.RetryManager(fatal=False):
-                with lock:
-                    refetch_manifest(format_id, delay)
+                refetch_manifest(format_id, delay)

                 f = next((f for f in formats if f['format_id'] == format_id), None)
                 if not f:

@@ -2832,6 +2832,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         begin_index = 0
         download_start_time = ctx.get('start') or time.time()

+        section_start = ctx.get('section_start') or 0
+        section_end = ctx.get('section_end') or math.inf
+
+        self.write_debug(f'Selected section: {section_start} -> {section_end}')
+
         lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
         if lack_early_segments:
             self.report_warning(bug_reports_message(

@@ -2852,9 +2857,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                                                or (mpd_url, stream_number, False))
             if not refresh_sequence:
                 if expire_fast and not is_live:
-                    return False, last_seq
+                    return False
                 elif old_mpd_url == mpd_url:
-                    return True, last_seq
+                    return True
+
             if manifestless_orig_fmt:
                 fmt_info = manifestless_orig_fmt
             else:

@@ -2865,14 +2871,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     fmts = None
                 if not fmts:
                     no_fragment_score += 2
-                    return False, last_seq
+                    return False
                 fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
             fragments = fmt_info['fragments']
             fragment_base_url = fmt_info['fragment_base_url']
             assert fragment_base_url
-            _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
-            return True, _last_seq
+            return True

         self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
         while is_live:

@@ -2892,11 +2897,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         last_segment_url = None
                         continue
                 else:
-                    should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+                    should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15)
                     no_fragment_score += 2
                     if not should_continue:
                         continue
+
+                last_fragment = fragments[-1]
+                last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+
+                known_fragment = next(
+                    (fragment for fragment in fragments if f'sq/{known_idx}' in fragment['path']), None)
+                if known_fragment and known_fragment['end'] > section_end:
+                    break

                 if known_idx > last_seq:
                     last_segment_url = None
                     continue

@@ -2906,20 +2919,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 if begin_index < 0 and known_idx < 0:
                     # skip from the start when it's negative value
                     known_idx = last_seq + begin_index
                 if lack_early_segments:
-                    known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+                    known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration']))
+
+                fragment_count = last_seq - known_idx if section_end == math.inf else int(
+                    (section_end - section_start) // last_fragment['duration'])
+
                 try:
                     for idx in range(known_idx, last_seq):
                         # do not update sequence here or you'll get skipped some part of it
-                        should_continue, _ = _extract_sequence_from_mpd(False, False)
+                        should_continue = _extract_sequence_from_mpd(False, False)
                         if not should_continue:
                             known_idx = idx - 1
                             raise ExtractorError('breaking out of outer loop')
-                        last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
-                        yield {
-                            'url': last_segment_url,
-                            'fragment_count': last_seq,
-                        }
+
+                        frag_duration = last_fragment['duration']
+                        frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
+                        frag_end = frag_start + frag_duration
+
+                        if frag_start >= section_start and frag_end <= section_end:
+                            last_segment_url = urljoin(fragment_base_url, f'sq/{idx}')
+                            yield {
+                                'url': last_segment_url,
+                                'fragment_count': fragment_count,
+                                'duration': frag_duration,
+                                'start': frag_start,
+                                'end': frag_end,
+                            }
+
                 if known_idx == last_seq:
                     no_fragment_score += 5
                 else:

@@ -3908,6 +3937,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
                 yield dct

+            if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'):
+                self.report_warning('For YT livestreams, --download-sections is only supported with --live-from-start')
+
             needs_live_processing = self._needs_live_processing(live_status, duration)
             skip_bad_formats = 'incomplete' not in format_types
             if self._configuration_arg('include_incomplete_formats'):

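The section filter works backwards from the newest fragment: live fragments have (nearly) constant duration, so the absolute start of sequence number idx is inferred from the last fragment's 'start'. A numeric sketch with assumed values:

    # Assumed values, for illustration only
    last_seq = 1000                                            # newest sequence number in the manifest
    last_fragment = {'start': 1678380500.0, 'duration': 2.0}
    section_start, section_end = 1678380470.0, 1678380495.0

    for idx in range(980, last_seq):
        frag_duration = last_fragment['duration']
        # fragment idx begins (last_seq - idx) fragment-lengths before the newest one
        frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
        frag_end = frag_start + frag_duration
        if frag_start >= section_start and frag_end <= section_end:
            print(f'sq/{idx}: {frag_start} -> {frag_end}')  # prints sq/985 through sq/996
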
yt_dlp/options.py

@@ -419,7 +419,14 @@ def create_parser():
     general.add_option(
         '--live-from-start',
         action='store_true', dest='live_from_start',
-        help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
+        help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). '
+              'Time ranges can be specified using --download-sections to download only a part of the stream. '
+              'Negative values are allowed for specifying a relative previous time, using the # syntax '
+              'e.g. --download-sections "#-24hours - 0" (download last 24 hours), '
+              'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), '
+              'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). '
+              'It is also possible to specify an exact unix timestamp range, using the * syntax, '
+              'e.g. --download-sections "*1672531200 - 1672549200" (download between those two timestamps)'))
     general.add_option(
         '--no-live-from-start',
         action='store_false', dest='live_from_start',

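In practice the two options are combined; per the warning added in the YouTube extractor above, --download-sections on a livestream only takes effect together with --live-from-start. For example, an invocation like yt-dlp --live-from-start --download-sections "#-1h - 0" URL would grab roughly the last hour of a running stream.
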
yt_dlp/utils/_utils.py

@@ -1209,7 +1209,7 @@ def unified_strdate(date_str, day_first=True):
     return str(upload_date)


-def unified_timestamp(date_str, day_first=True):
+def unified_timestamp(date_str, day_first=True, with_milliseconds=False):
     if not isinstance(date_str, str):
         return None

@@ -1235,7 +1235,7 @@ def unified_timestamp(date_str, day_first=True):
     for expression in date_formats(day_first):
         with contextlib.suppress(ValueError):
             dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
-            return calendar.timegm(dt_.timetuple())
+            return calendar.timegm(dt_.timetuple()) + (dt_.microsecond / 1e6 if with_milliseconds else 0)

     timetuple = email.utils.parsedate_tz(date_str)
     if timetuple:

@@ -2035,16 +2035,19 @@ def parse_duration(s):
     days, hours, mins, secs, ms = [None] * 5
     m = re.match(r'''(?x)
+            (?P<sign>[+-])?
             (?P<before_secs>
                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
             (?P<ms>[.:][0-9]+)?Z?$
         ''', s)
     if m:
-        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
+        sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms')
     else:
         m = re.match(
-            r'''(?ix)(?:P?
+            r'''(?ix)(?:
+                (?P<sign>[+-])?
+                P?
                 (?:
                     [0-9]+\s*y(?:ears?)?,?\s*
                 )?

@@ -2068,17 +2071,19 @@ def parse_duration(s):
                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                 )?Z?$''', s)
         if m:
-            days, hours, mins, secs, ms = m.groups()
+            sign, days, hours, mins, secs, ms = m.groups()
         else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'(?i)(?P<sign>[+-])?(?:(?P<days>[0-9.]+)\s*(?:days?)|(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
             if m:
-                hours, mins = m.groups()
+                sign, days, hours, mins = m.groups()
             else:
                 return None

+    sign = -1 if sign == '-' else 1
+
     if ms:
         ms = ms.replace(':', '.')
-    return sum(float(part or 0) * mult for part, mult in (
+    return sign * sum(float(part or 0) * mult for part, mult in (
         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
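
With the sign group captured, relative offsets such as those used by the new '#' range syntax parse directly. A few values implied by the regexes above (a sketch, not taken from the test suite):

    from yt_dlp.utils import parse_duration

    assert parse_duration('30m') == 1800
    assert parse_duration('-1h') == -3600        # sign now propagates to the result
    assert parse_duration('-24hours') == -86400
    assert parse_duration('-3days') == -259200   # bare 'days' is new in the fallback pattern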