commit bd17239fea
Author: Elyse
Date:   2024-05-06 02:18:09 +03:00 (committed via GitHub)

8 changed files with 111 additions and 34 deletions

test/test_utils.py

@@ -411,10 +411,15 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
         self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667)
         self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1)
         self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86)
         self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78)
+        self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646)
+        # ISO8601 spec says that if no timezone is specified, we should use local timezone;
+        # but yt-dlp uses UTC to keep things consistent
+        self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314)

     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')

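A minimal sketch of what the new tests exercise, assuming only the public yt_dlp.utils API (an illustration, not part of the commit):

    from yt_dlp.utils import unified_timestamp

    # By default the result is truncated to whole seconds
    assert unified_timestamp('2023-03-09T18:01:33.646Z') == 1678384893
    # Opting in via the new keyword keeps the fractional part
    assert unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True) == 1678384893.646
    # No timezone given: interpreted as UTC for consistency, not local time
    assert unified_timestamp('2023-03-11T06:48:34.008') == 1678517314
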
yt_dlp/YoutubeDL.py

@@ -27,7 +27,12 @@ from .cache import Cache
 from .compat import functools, urllib  # isort: split
 from .compat import compat_os_name, urllib_req_to_req
 from .cookies import LenientSimpleCookie, load_cookies
-from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
+from .downloader import (
+    DashSegmentsFD,
+    FFmpegFD,
+    get_suitable_downloader,
+    shorten_protocol_name,
+)
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
 from .extractor.common import UnsupportedURLIE

@@ -3353,7 +3358,7 @@ class YoutubeDL:
             fd, success = None, True
             if info_dict.get('protocol') or info_dict.get('url'):
                 fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
-                if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and (
+                if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and (
                        info_dict.get('section_start') or info_dict.get('section_end')):
                    msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
                           else 'You have requested downloading the video partially, but ffmpeg is not installed')

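The effect of the YoutubeDL.py change: a section request no longer forces the ffmpeg downloader when DashSegmentsFD was selected, since that downloader now honours section bounds itself. A hedged paraphrase of the guard as a standalone predicate (hypothetical helper name, illustration only):

    def forces_ffmpeg_for_sections(fd, params, info_dict):
        # True when a partial download was requested but the chosen
        # downloader cannot cut sections natively (illustration only)
        wants_section = bool(info_dict.get('section_start') or info_dict.get('section_end'))
        return (fd not in (FFmpegFD, DashSegmentsFD)
                and 'no-direct-merge' not in params['compat_opts']
                and wants_section)
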
yt_dlp/__init__.py

@@ -12,6 +12,7 @@ import itertools
 import optparse
 import os
 import re
+import time
 import traceback

 from .compat import compat_os_name, compat_shlex_quote

@@ -331,12 +332,13 @@ def validate_options(opts):
                 (?P<end_sign>-?)(?P<end>[^-]+)
             )?'''

+            current_time = time.time()
             chapters, ranges, from_url = [], [], False
             for regex in value or []:
                 if advanced and regex == '*from-url':
                     from_url = True
                     continue
-                elif not regex.startswith('*'):
+                elif not regex.startswith('*') and not regex.startswith('#'):
                     try:
                         chapters.append(re.compile(regex))
                     except re.error as err:

@@ -353,11 +355,16 @@ def validate_options(opts):
                         err = 'Must be of the form "*start-end"'
                     elif not advanced and any(signs):
                         err = 'Negative timestamps are not allowed'
-                    else:
+                    elif regex.startswith('*'):
                         dur[0] *= -1 if signs[0] else 1
                         dur[1] *= -1 if signs[1] else 1
                         if dur[1] == float('-inf'):
                             err = '"-inf" is not a valid end'
+                    elif regex.startswith('#'):
+                        dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time
+                        dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time
+                        if dur[1] == float('-inf'):
+                            err = '"-inf" is not a valid end'
                     if err:
                         raise ValueError(f'invalid {name} time range "{regex}". {err}')
                     ranges.append(dur)

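Under the new '#' prefix, both endpoints are offsets from the wall clock at option-parsing time (negative meaning "in the past"), whereas '*' keeps absolute timestamps. A standalone sketch of the arithmetic, not the actual validate_options code:

    import time

    def resolve_hash_range(start, end, start_negative, end_negative):
        # '#-24hours - 0' => [now - 86400, now]
        now = time.time()
        return (start * (-1 if start_negative else 1) + now,
                end * (-1 if end_negative else 1) + now)

    # '#-1h - 30m': from one hour ago until 30 minutes from now
    section_start, section_end = resolve_hash_range(3600, 1800, True, False)
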
yt_dlp/downloader/dash.py

@@ -36,6 +36,8 @@ class DashSegmentsFD(FragmentFD):
             'filename': fmt.get('filepath') or filename,
             'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'),
             'total_frags': fragment_count,
+            'section_start': info_dict.get('section_start'),
+            'section_end': info_dict.get('section_end'),
         }

         if real_downloader:

yt_dlp/extractor/common.py

@@ -2695,7 +2695,7 @@ class InfoExtractor:
                     r = int(s.get('r', 0))
                     ms_info['total_number'] += 1 + r
                     ms_info['s'].append({
-                        't': int(s.get('t', 0)),
+                        't': int_or_none(s.get('t')),
                         # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                         'd': int(s.attrib['d']),
                         'r': r,

@@ -2737,8 +2737,14 @@ class InfoExtractor:
             return ms_info

         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        availability_start_time = unified_timestamp(
+            mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
         stream_numbers = collections.defaultdict(int)
         for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+            # segmentIngestTime is completely out of spec, but YT Livestream do this
+            segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
+            if segment_ingest_time:
+                availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
             period_entry = {
                 'id': period.get('id', f'period-{period_idx}'),
                 'formats': [],

@@ -2917,13 +2923,17 @@ class InfoExtractor:
                                 'Bandwidth': bandwidth,
                                 'Number': segment_number,
                             }
+                            duration = float_or_none(segment_d, representation_ms_info['timescale'])
+                            start = float_or_none(segment_time, representation_ms_info['timescale'])
                             representation_ms_info['fragments'].append({
                                 media_location_key: segment_url,
-                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                'duration': duration,
+                                'start': availability_start_time + start,
+                                'end': availability_start_time + start + duration,
                             })

                         for num, s in enumerate(representation_ms_info['s']):
-                            segment_time = s.get('t') or segment_time
+                            segment_time = s['t'] if s.get('t') is not None else segment_time
                             segment_d = s['d']
                             add_segment_url()
                             segment_number += 1

@@ -2939,6 +2949,7 @@ class InfoExtractor:
                 fragments = []
                 segment_index = 0
                 timescale = representation_ms_info['timescale']
+                start = 0
                 for s in representation_ms_info['s']:
                     duration = float_or_none(s['d'], timescale)
                     for r in range(s.get('r', 0) + 1):

@@ -2946,8 +2957,11 @@ class InfoExtractor:
                         fragments.append({
                             location_key(segment_uri): segment_uri,
                             'duration': duration,
+                            'start': availability_start_time + start,
+                            'end': availability_start_time + start + duration,
                         })
                         segment_index += 1
+                        start += duration

                 representation_ms_info['fragments'] = fragments
             elif 'segment_urls' in representation_ms_info:
                 # Segment URLs with no SegmentTimeline

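With these changes every DASH fragment carries absolute wall-clock bounds: tick values from the SegmentTimeline are divided by the timescale and offset by availabilityStartTime (or by YouTube's out-of-spec segmentIngestTime when present). A worked example with assumed numbers:

    # Assumed inputs, for illustration: one timeline segment with
    # t=90000, d=45000 at timescale=90000, in an MPD whose
    # availabilityStartTime resolved to 1678380000.0 (unix seconds)
    timescale = 90000
    availability_start_time = 1678380000.0
    t, d = 90000, 45000

    start = t / timescale        # 1.0 second into the period
    duration = d / timescale     # 0.5 seconds
    fragment = {
        'duration': duration,
        'start': availability_start_time + start,             # 1678380001.0
        'end': availability_start_time + start + duration,    # 1678380001.5
    }
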
yt_dlp/extractor/youtube.py

@@ -2791,17 +2791,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             microformats = traverse_obj(
                 prs, (..., 'microformat', 'playerMicroformatRenderer'),
                 expected_type=dict)
-            _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
-            is_live = live_status == 'is_live'
-            start_time = time.time()
+            with lock:
+                _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
+                is_live = live_status == 'is_live'
+                start_time = time.time()

         def mpd_feed(format_id, delay):
             """
             @returns (manifest_url, manifest_stream_number, is_live) or None
             """
             for retry in self.RetryManager(fatal=False):
-                with lock:
-                    refetch_manifest(format_id, delay)
+                refetch_manifest(format_id, delay)

                 f = next((f for f in formats if f['format_id'] == format_id), None)
                 if not f:

@@ -2832,6 +2832,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         begin_index = 0
         download_start_time = ctx.get('start') or time.time()

+        section_start = ctx.get('section_start') or 0
+        section_end = ctx.get('section_end') or math.inf
+
+        self.write_debug(f'Selected section: {section_start} -> {section_end}')
+
         lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
         if lack_early_segments:
             self.report_warning(bug_reports_message(

@@ -2852,9 +2857,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                                                or (mpd_url, stream_number, False))
             if not refresh_sequence:
                 if expire_fast and not is_live:
-                    return False, last_seq
+                    return False
                 elif old_mpd_url == mpd_url:
-                    return True, last_seq
+                    return True
+
             if manifestless_orig_fmt:
                 fmt_info = manifestless_orig_fmt
             else:

@@ -2865,14 +2871,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     fmts = None
                 if not fmts:
                     no_fragment_score += 2
-                    return False, last_seq
+                    return False
                 fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
             fragments = fmt_info['fragments']
             fragment_base_url = fmt_info['fragment_base_url']
             assert fragment_base_url
-            _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
-            return True, _last_seq
+            return True

         self.write_debug(f'[{video_id}] Generating fragments for format {format_id}')
         while is_live:

@@ -2892,11 +2897,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         last_segment_url = None
                         continue
                 else:
-                    should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15)
+                    should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15)
                     no_fragment_score += 2
                     if not should_continue:
                         continue
+
+                last_fragment = fragments[-1]
+                last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+
+                known_fragment = next(
+                    (fragment for fragment in fragments if f'sq/{known_idx}' in fragment['path']), None)
+                if known_fragment and known_fragment['end'] > section_end:
+                    break

                 if known_idx > last_seq:
                     last_segment_url = None
                     continue

@@ -2906,20 +2919,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 if begin_index < 0 and known_idx < 0:
                     # skip from the start when it's negative value
                     known_idx = last_seq + begin_index
                 if lack_early_segments:
-                    known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+                    known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration']))
+
+                fragment_count = last_seq - known_idx if section_end == math.inf else int(
+                    (section_end - section_start) // last_fragment['duration'])
+
                 try:
                     for idx in range(known_idx, last_seq):
                         # do not update sequence here or you'll get skipped some part of it
-                        should_continue, _ = _extract_sequence_from_mpd(False, False)
+                        should_continue = _extract_sequence_from_mpd(False, False)
                         if not should_continue:
                             known_idx = idx - 1
                             raise ExtractorError('breaking out of outer loop')
-                        last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
-                        yield {
-                            'url': last_segment_url,
-                            'fragment_count': last_seq,
-                        }
+
+                        frag_duration = last_fragment['duration']
+                        frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
+                        frag_end = frag_start + frag_duration
+
+                        if frag_start >= section_start and frag_end <= section_end:
+                            last_segment_url = urljoin(fragment_base_url, f'sq/{idx}')
+                            yield {
+                                'url': last_segment_url,
+                                'fragment_count': fragment_count,
+                                'duration': frag_duration,
+                                'start': frag_start,
+                                'end': frag_end,
+                            }
+
                 if known_idx == last_seq:
                     no_fragment_score += 5
                 else:

@@ -3908,6 +3937,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE}
                 yield dct

+            if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'):
+                self.report_warning('For YT livestreams, --download-sections is only supported with --live-from-start')
+
             needs_live_processing = self._needs_live_processing(live_status, duration)
             skip_bad_formats = 'incomplete' not in format_types
             if self._configuration_arg('include_incomplete_formats'):

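The section filter works backwards from the newest fragment: live fragments have (nearly) constant duration, so the absolute start of sequence number idx is inferred from the last fragment's 'start'. A numeric sketch with assumed values:

    # Assumed values, for illustration only
    last_seq = 1000                                            # newest sequence number in the manifest
    last_fragment = {'start': 1678380500.0, 'duration': 2.0}
    section_start, section_end = 1678380470.0, 1678380495.0

    for idx in range(980, last_seq):
        frag_duration = last_fragment['duration']
        # fragment idx begins (last_seq - idx) fragment-lengths before the newest one
        frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration
        frag_end = frag_start + frag_duration
        if frag_start >= section_start and frag_end <= section_end:
            print(f'sq/{idx}: {frag_start} -> {frag_end}')  # prints sq/985 through sq/996
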
yt_dlp/options.py

@@ -419,7 +419,14 @@ def create_parser():
     general.add_option(
         '--live-from-start',
         action='store_true', dest='live_from_start',
-        help='Download livestreams from the start. Currently only supported for YouTube (Experimental)')
+        help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). '
+              'Time ranges can be specified using --download-sections to download only a part of the stream. '
+              'Negative values are allowed for specifying a relative previous time, using the # syntax '
+              'e.g. --download-sections "#-24hours - 0" (download last 24 hours), '
+              'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), '
+              'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). '
+              'It is also possible to specify an exact unix timestamp range, using the * syntax, '
+              'e.g. --download-sections "*1672531200 - 1672549200" (download between those two timestamps)'))
     general.add_option(
         '--no-live-from-start',
         action='store_false', dest='live_from_start',

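In practice the two options are combined; per the warning added in the YouTube extractor above, --download-sections on a livestream only takes effect together with --live-from-start. For example, an invocation like yt-dlp --live-from-start --download-sections "#-1h - 0" URL would grab roughly the last hour of a running stream.
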
yt_dlp/utils/_utils.py

@@ -1209,7 +1209,7 @@ def unified_strdate(date_str, day_first=True):
     return str(upload_date)


-def unified_timestamp(date_str, day_first=True):
+def unified_timestamp(date_str, day_first=True, with_milliseconds=False):
     if not isinstance(date_str, str):
         return None

@@ -1235,7 +1235,7 @@ def unified_timestamp(date_str, day_first=True):
     for expression in date_formats(day_first):
         with contextlib.suppress(ValueError):
             dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
-            return calendar.timegm(dt_.timetuple())
+            return calendar.timegm(dt_.timetuple()) + (dt_.microsecond / 1e6 if with_milliseconds else 0)

     timetuple = email.utils.parsedate_tz(date_str)
     if timetuple:

@@ -2035,16 +2035,19 @@ def parse_duration(s):
     days, hours, mins, secs, ms = [None] * 5
     m = re.match(r'''(?x)
+            (?P<sign>[+-])?
             (?P<before_secs>
                 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
             (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
             (?P<ms>[.:][0-9]+)?Z?$
         ''', s)
     if m:
-        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
+        sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms')
     else:
         m = re.match(
-            r'''(?ix)(?:P?
+            r'''(?ix)(?:
+                (?P<sign>[+-])?
+                P?
                 (?:
                     [0-9]+\s*y(?:ears?)?,?\s*
                 )?

@@ -2068,17 +2071,19 @@ def parse_duration(s):
                     (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                 )?Z?$''', s)
         if m:
-            days, hours, mins, secs, ms = m.groups()
+            sign, days, hours, mins, secs, ms = m.groups()
         else:
-            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            m = re.match(r'(?i)(?P<sign>[+-])?(?:(?P<days>[0-9.]+)\s*(?:days?)|(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
             if m:
-                hours, mins = m.groups()
+                sign, days, hours, mins = m.groups()
             else:
                 return None

+    sign = -1 if sign == '-' else 1
+
     if ms:
         ms = ms.replace(':', '.')
-    return sum(float(part or 0) * mult for part, mult in (
+    return sign * sum(float(part or 0) * mult for part, mult in (
         (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
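
With the sign group captured, relative offsets such as those used by the new '#' range syntax parse directly. A few values implied by the regexes above (a sketch, not taken from the test suite):

    from yt_dlp.utils import parse_duration

    assert parse_duration('30m') == 1800
    assert parse_duration('-1h') == -3600        # sign now propagates to the result
    assert parse_duration('-24hours') == -86400
    assert parse_duration('-3days') == -259200   # bare 'days' is new in the fallback pattern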