From 2fbd6de957bfd13bd0873ee865a7e693534fd3a4 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 9 Mar 2023 11:30:40 -0600 Subject: [PATCH 01/45] [utils] Add hackish 'now' support for --download-sections --- yt_dlp/__init__.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 9ef31601c..b52aa1e9e 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -42,6 +42,7 @@ from .utils import ( GeoUtils, PlaylistEntries, SameFileError, + datetime_from_str, decodeOption, download_range_func, expand_path, @@ -320,12 +321,23 @@ def validate_options(opts): del opts.outtmpl['default'] def parse_chapters(name, value): + def parse_timestamp(x): + # FIXME: This should be smarter, e.g. 'inf-1day'? + x = x.replace('(', '').replace(')', '') + + if x in ('inf', 'infinite'): + return float('inf') + + if re.match(r'[\d:]+', x): + return parse_duration(x) + + return datetime_from_str(x, precision='second').timestamp() + chapters, ranges = [], [] - parse_timestamp = lambda x: float('inf') if x in ('inf', 'infinite') else parse_duration(x) for regex in value or []: if regex.startswith('*'): for range_ in map(str.strip, regex[1:].split(',')): - mobj = range_ != '-' and re.fullmatch(r'([^-]+)?\s*-\s*([^-]+)?', range_) + mobj = range_ != '-' and re.fullmatch(r'(.+)?\s*-\s*(.+)?', range_) dur = mobj and (parse_timestamp(mobj.group(1) or '0'), parse_timestamp(mobj.group(2) or 'inf')) if None in (dur or [None]): raise ValueError(f'invalid {name} time range "{regex}". Must be of the form "*start-end"') From 439be2b4a443bb7edd89f9037d509adb8067f954 Mon Sep 17 00:00:00 2001 From: Sophire <115919609+sophie0x@users.noreply.github.com> Date: Thu, 20 Oct 2022 19:50:58 -0500 Subject: [PATCH 02/45] [utils] Add microseconds to unified_timestamp --- test/test_utils.py | 2 ++ yt_dlp/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3045b6d7e..fceca03ec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -415,10 +415,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667) self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78) + self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 8c2c5593c..a2e2c8fc6 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1843,7 +1843,7 @@ def unified_strdate(date_str, day_first=True): return str(upload_date) -def unified_timestamp(date_str, day_first=True): +def unified_timestamp(date_str, day_first=True, with_milliseconds=False): if date_str is None: return None @@ -1869,7 +1869,7 @@ def unified_timestamp(date_str, day_first=True): for expression in date_formats(day_first): with contextlib.suppress(ValueError): dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) + return calendar.timegm(dt.timetuple()) + (dt.microsecond/1e6 if with_milliseconds else 0) timetuple = email.utils.parsedate_tz(date_str) if timetuple: From 367429e23879ae127b82e7e8cacd62b878033e75 Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 9 Mar 2023 12:08:20 -0600 Subject: [PATCH 03/45] [common] Extract start and end keys for Dash fragments --- yt_dlp/extractor/common.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 2091df7fa..815538248 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2566,7 +2566,7 @@ class InfoExtractor: r = int(s.get('r', 0)) ms_info['total_number'] += 1 + r ms_info['s'].append({ - 't': int(s.get('t', 0)), + 't': int_or_none(s.get('t')), # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) 'd': int(s.attrib['d']), 'r': r, @@ -2608,9 +2608,16 @@ class InfoExtractor: return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) + availability_start_time = unified_timestamp( + mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0 formats, subtitles = [], {} stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): + # segmentIngestTime is completely out of spec, but YT Livestream do this + segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime') + if segment_ingest_time: + availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True) + period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { 'start_number': 1, @@ -2784,13 +2791,17 @@ class InfoExtractor: 'Bandwidth': bandwidth, 'Number': segment_number, } + duration = float_or_none(segment_d, representation_ms_info['timescale']) + start = float_or_none(segment_time, representation_ms_info['timescale']) representation_ms_info['fragments'].append({ media_location_key: segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + 'duration': duration, + 'start': availability_start_time + start, + 'end': availability_start_time + start + duration, }) for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time + segment_time = s['t'] if s.get('t') is not None else segment_time segment_d = s['d'] add_segment_url() segment_number += 1 @@ -2806,6 +2817,7 @@ class InfoExtractor: fragments = [] segment_index = 0 timescale = representation_ms_info['timescale'] + start = 0 for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): @@ -2813,8 +2825,11 @@ class InfoExtractor: fragments.append({ location_key(segment_uri): segment_uri, 'duration': duration, + 'start': availability_start_time + start, + 'end': availability_start_time + start + duration, }) segment_index += 1 + start += duration representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline From 1799a6ae364c13286e3195ae03a39e7aa82334bf Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 9 Mar 2023 17:18:44 -0600 Subject: [PATCH 04/45] [utils] Allow using local timezone for 'now' timestamps --- yt_dlp/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index a2e2c8fc6..7dfc1d6dc 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1893,7 +1893,7 @@ def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) -def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): +def datetime_from_str(date_str, precision='auto', format='%Y%m%d', use_utc=True): R""" Return a datetime object from a string. Supported format: @@ -1902,12 +1902,13 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): @param format strftime format of DATE @param precision Round the datetime object: auto|microsecond|second|minute|hour|day auto: round to the unit provided in date_str (if applicable). + @param use_utc Use UTC instead of local timezone for 'now' timestamps. """ auto_precision = False if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.utcnow(), precision) + today = datetime_round(datetime.datetime.utcnow() if use_utc else datetime.datetime.now(), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': @@ -1916,7 +1917,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): r'(?P.+)(?P[+-])(?P