[extractor] Detect stpp as subtitles in MPD

Closes #656
Solution by: fstirlitz
This commit is contained in:
pukkandan 2021-08-10 04:42:03 +05:30
parent 7be9ccff0b
commit be2fc5b212
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698

View File

@ -2596,215 +2596,223 @@ class InfoExtractor(object):
mime_type = representation_attrib['mimeType'] mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg': codecs = representation_attrib.get('codecs', '')
base_url = '' if content_type not in ('video', 'audio', 'text'):
for element in (representation, adaptation_set, period, mpd_doc): if mime_type == 'image/jpeg':
base_url_e = element.find(_add_ns('BaseURL')) content_type = 'image/jpeg'
if base_url_e is not None: if codecs.split('.')[0] == 'stpp':
base_url = base_url_e.text + base_url content_type = 'text'
if re.match(r'^https?://', base_url):
break
if mpd_base_url and not re.match(r'^https?://', base_url):
if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
if representation_id is not None:
format_id = representation_id
else: else:
format_id = content_type self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
if mpd_id: continue
format_id = mpd_id + '-' + format_id
if content_type in ('video', 'audio'):
f = {
'format_id': format_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(representation_attrib.get('codecs')))
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
'manifest_url': mpd_url,
'filesize': filesize,
}
elif mime_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers): base_url = ''
tmpl = representation_ms_info[template_name] for element in (representation, adaptation_set, period, mpd_doc):
# First of, % characters outside $...$ templates base_url_e = element.find(_add_ns('BaseURL'))
# must be escaped by doubling for proper processing if base_url_e is not None:
# by % operator string formatting used further (see base_url = base_url_e.text + base_url
# https://github.com/ytdl-org/youtube-dl/issues/16867). if re.match(r'^https?://', base_url):
t = '' break
in_template = False if mpd_base_url and not re.match(r'^https?://', base_url):
for c in tmpl: if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
lang = representation_attrib.get('lang')
url_el = representation.find(_add_ns('BaseURL'))
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
if representation_id is not None:
format_id = representation_id
else:
format_id = content_type
if mpd_id:
format_id = mpd_id + '-' + format_id
if content_type in ('video', 'audio'):
f = {
'format_id': format_id,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
}
f.update(parse_codecs(codecs))
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
'manifest_url': mpd_url,
'filesize': filesize,
}
elif content_type == 'image/jpeg':
# See test case in VikiIE
# https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
f = {
'format_id': format_id,
'ext': 'mhtml',
'manifest_url': mpd_url,
'format_note': 'DASH storyboards (jpeg)',
'acodec': 'none',
'vcodec': 'none',
}
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
tmpl = representation_ms_info[template_name]
# First of, % characters outside $...$ templates
# must be escaped by doubling for proper processing
# by % operator string formatting used further (see
# https://github.com/ytdl-org/youtube-dl/issues/16867).
t = ''
in_template = False
for c in tmpl:
t += c
if c == '$':
in_template = not in_template
elif c == '%' and not in_template:
t += c t += c
if c == '$': # Next, $...$ templates are translated to their
in_template = not in_template # %(...) counterparts to be used with % operator
elif c == '%' and not in_template: if representation_id is not None:
t += c t = t.replace('$RepresentationID$', representation_id)
# Next, $...$ templates are translated to their t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
# %(...) counterparts to be used with % operator t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
if representation_id is not None: t.replace('$$', '$')
t = t.replace('$RepresentationID$', representation_id) return t
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
t.replace('$$', '$')
return t
# @initialization is a regular template like @media one # @initialization is a regular template like @media one
# so it should be handled just the same way (see # so it should be handled just the same way (see
# https://github.com/ytdl-org/youtube-dl/issues/11605) # https://github.com/ytdl-org/youtube-dl/issues/11605)
if 'initialization' in representation_ms_info: if 'initialization' in representation_ms_info:
initialization_template = prepare_template( initialization_template = prepare_template(
'initialization', 'initialization',
# As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
# $Time$ shall not be included for @initialization thus # $Time$ shall not be included for @initialization thus
# only $Bandwidth$ remains # only $Bandwidth$ remains
('Bandwidth', )) ('Bandwidth', ))
representation_ms_info['initialization_url'] = initialization_template % { representation_ms_info['initialization_url'] = initialization_template % {
'Bandwidth': bandwidth, 'Bandwidth': bandwidth,
} }
def location_key(location): def location_key(location):
return 'url' if re.match(r'^https?://', location) else 'path' return 'url' if re.match(r'^https?://', location) else 'path'
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
media_location_key = location_key(media_template) media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time # can't be used at the same time
if '%(Number' in media_template and 's' not in representation_ms_info: if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{ representation_ms_info['fragments'] = [{
media_location_key: media_template % { media_location_key: media_template % {
'Number': segment_number, 'Number': segment_number,
'Bandwidth': bandwidth, 'Bandwidth': bandwidth,
}, },
'duration': segment_duration, 'duration': segment_duration,
} for segment_number in range( } for segment_number in range(
representation_ms_info['start_number'], representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])] representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else: else:
# $Number*$ or $Time$ in media template with S list available # $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['fragments'] = [] representation_ms_info['fragments'] = []
segment_time = 0 segment_time = 0
segment_d = None segment_d = None
segment_number = representation_ms_info['start_number'] segment_number = representation_ms_info['start_number']
def add_segment_url(): def add_segment_url():
segment_url = media_template % { segment_url = media_template % {
'Time': segment_time, 'Time': segment_time,
'Bandwidth': bandwidth, 'Bandwidth': bandwidth,
'Number': segment_number, 'Number': segment_number,
} }
representation_ms_info['fragments'].append({ representation_ms_info['fragments'].append({
media_location_key: segment_url, media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']), 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
}) })
for num, s in enumerate(representation_ms_info['s']): for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time segment_time = s.get('t') or segment_time
segment_d = s['d'] segment_d = s['d']
add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url() add_segment_url()
segment_number += 1 segment_number += 1
for r in range(s.get('r', 0)): segment_time += segment_d
segment_time += segment_d elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
add_segment_url() # No media template
segment_number += 1 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
segment_time += segment_d # or any YouTube dashsegments video
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: fragments = []
# No media template segment_index = 0
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI timescale = representation_ms_info['timescale']
# or any YouTube dashsegments video for s in representation_ms_info['s']:
fragments = [] duration = float_or_none(s['d'], timescale)
segment_index = 0 for r in range(s.get('r', 0) + 1):
timescale = representation_ms_info['timescale'] segment_uri = representation_ms_info['segment_urls'][segment_index]
for s in representation_ms_info['s']: fragments.append({
duration = float_or_none(s['d'], timescale) location_key(segment_uri): segment_uri,
for r in range(s.get('r', 0) + 1): 'duration': duration,
segment_uri = representation_ms_info['segment_urls'][segment_index] })
fragments.append({ segment_index += 1
location_key(segment_uri): segment_uri, representation_ms_info['fragments'] = fragments
'duration': duration, elif 'segment_urls' in representation_ms_info:
}) # Segment URLs with no SegmentTimeline
segment_index += 1 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
representation_ms_info['fragments'] = fragments # https://github.com/ytdl-org/youtube-dl/pull/14844
elif 'segment_urls' in representation_ms_info: fragments = []
# Segment URLs with no SegmentTimeline segment_duration = float_or_none(
# Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 representation_ms_info['segment_duration'],
# https://github.com/ytdl-org/youtube-dl/pull/14844 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
fragments = [] for segment_url in representation_ms_info['segment_urls']:
segment_duration = float_or_none( fragment = {
representation_ms_info['segment_duration'], location_key(segment_url): segment_url,
representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None }
for segment_url in representation_ms_info['segment_urls']: if segment_duration:
fragment = { fragment['duration'] = segment_duration
location_key(segment_url): segment_url, fragments.append(fragment)
} representation_ms_info['fragments'] = fragments
if segment_duration: # If there is a fragments key available then we correctly recognized fragmented media.
fragment['duration'] = segment_duration # Otherwise we will assume unfragmented media with direct access. Technically, such
fragments.append(fragment) # assumption is not necessarily correct since we may simply have no support for
representation_ms_info['fragments'] = fragments # some forms of fragmented media renditions yet, but for now we'll use this fallback.
# If there is a fragments key available then we correctly recognized fragmented media. if 'fragments' in representation_ms_info:
# Otherwise we will assume unfragmented media with direct access. Technically, such f.update({
# assumption is not necessarily correct since we may simply have no support for # NB: mpd_url may be empty when MPD manifest is parsed from a string
# some forms of fragmented media renditions yet, but for now we'll use this fallback. 'url': mpd_url or base_url,
if 'fragments' in representation_ms_info: 'fragment_base_url': base_url,
f.update({ 'fragments': [],
# NB: mpd_url may be empty when MPD manifest is parsed from a string 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
'url': mpd_url or base_url, })
'fragment_base_url': base_url, if 'initialization_url' in representation_ms_info:
'fragments': [], initialization_url = representation_ms_info['initialization_url']
'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', if not f.get('url'):
}) f['url'] = initialization_url
if 'initialization_url' in representation_ms_info: f['fragments'].append({location_key(initialization_url): initialization_url})
initialization_url = representation_ms_info['initialization_url'] f['fragments'].extend(representation_ms_info['fragments'])
if not f.get('url'):
f['url'] = initialization_url
f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
else: else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) # Assuming direct URL to unfragmented media.
f['url'] = base_url
if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles return formats, subtitles
def _extract_ism_formats(self, *args, **kwargs): def _extract_ism_formats(self, *args, **kwargs):