[Ted] Rewrite extractor (#2359)

Closes #2343
Authored by: pukkandan, trassshhub
This commit is contained in:
trasssh 2022-01-20 00:04:20 +08:00 committed by GitHub
parent dfb7f2a25d
commit 4259402c56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 194 additions and 314 deletions

View File

@ -13,7 +13,7 @@ from test.helper import FakeYDL, md5, is_download_test
from yt_dlp.extractor import ( from yt_dlp.extractor import (
YoutubeIE, YoutubeIE,
DailymotionIE, DailymotionIE,
TEDIE, TedTalkIE,
VimeoIE, VimeoIE,
WallaIE, WallaIE,
CeskaTelevizeIE, CeskaTelevizeIE,
@ -141,7 +141,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
@is_download_test @is_download_test
class TestTedSubtitles(BaseTestSubtitles): class TestTedSubtitles(BaseTestSubtitles):
url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
IE = TEDIE IE = TedTalkIE
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True

View File

@ -1522,7 +1522,12 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE from .teamcoco import TeamcocoIE
from .teamtreehouse import TeamTreeHouseIE from .teamtreehouse import TeamTreeHouseIE
from .techtalks import TechTalksIE from .techtalks import TechTalksIE
from .ted import TEDIE from .ted import (
TedEmbedIE,
TedPlaylistIE,
TedSeriesIE,
TedTalkIE,
)
from .tele5 import Tele5IE from .tele5 import Tele5IE
from .tele13 import Tele13IE from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE from .telebruxelles import TeleBruxellesIE

View File

@ -115,6 +115,7 @@ from .channel9 import Channel9IE
from .vshare import VShareIE from .vshare import VShareIE
from .mediasite import MediasiteIE from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE from .springboardplatform import SpringboardPlatformIE
from .ted import TedEmbedIE
from .yapfiles import YapFilesIE from .yapfiles import YapFilesIE
from .vice import ViceIE from .vice import ViceIE
from .xfileshare import XFileShareIE from .xfileshare import XFileShareIE
@ -3174,10 +3175,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Tvigle') return self.url_result(mobj.group('url'), 'Tvigle')
# Look for embedded TED player # Look for embedded TED player
mobj = re.search( ted_urls = TedEmbedIE._extract_urls(webpage)
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) if ted_urls:
if mobj is not None: return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
return self.url_result(mobj.group('url'), 'TED')
# Look for embedded Ustream videos # Look for embedded Ustream videos
ustream_url = UstreamIE._extract_url(webpage) ustream_url = UstreamIE._extract_url(webpage)

View File

@ -1,274 +1,105 @@
from __future__ import unicode_literals import itertools
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse
)
from ..utils import ( from ..utils import (
extract_attributes,
float_or_none,
int_or_none, int_or_none,
str_to_int,
try_get, try_get,
url_or_none, url_or_none,
unified_strdate,
parse_duration,
) )
class TEDIE(InfoExtractor): class TedBaseIE(InfoExtractor):
IE_NAME = 'ted' _VALID_URL_BASE = r'https?://www\.ted\.com/(?:{type})(?:/lang/[^/#?]+)?/(?P<id>[\w-]+)'
_VALID_URL = r'''(?x)
(?P<proto>https?://) def _parse_playlist(self, playlist):
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ for entry in try_get(playlist, lambda x: x['videos']['nodes'], list):
( if entry.get('__typename') == 'Video' and entry.get('canonicalUrl'):
(?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist yield self.url_result(entry['canonicalUrl'], TedTalkIE.ie_key())
|
((?P<type_talk>talks)) # We have a simple talk
| class TedTalkIE(TedBaseIE):
(?P<type_watch>watch)/[^/]+/[^/]+ _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type='talks')
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>[\w-]+) # Here goes the name and then ".html"
.*)$
'''
_TESTS = [{ _TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 'url': 'https://www.ted.com/talks/candace_parker_how_to_break_down_barriers_and_not_accept_limits',
'md5': 'b0ce2b05ca215042124fbc9e3886493a', 'md5': '47e82c666d9c3261d4fe74748a90aada',
'info_dict': { 'info_dict': {
'id': '102', 'id': '86532',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The illusion of consciousness', 'title': 'How to break down barriers and not accept limits',
'description': ('Philosopher Dan Dennett makes a compelling ' 'description': 'md5:000707cece219d1e165b11550d612331',
'argument that not only don\'t we understand our own '
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
'width': 853,
'duration': 1308,
'view_count': int, 'view_count': int,
'comment_count': int, 'tags': ['personal growth', 'equality', 'activism', 'motivation', 'social change', 'sports'],
'tags': list, 'uploader': 'Candace Parker',
'duration': 676.0,
'upload_date': '20220114',
'release_date': '20211201',
'thumbnail': r're:http.*\.jpg',
}, },
'params': {
'skip_download': True,
},
}, {
# missing HTTP bitrates
'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
'info_dict': {
'id': '6069',
'ext': 'mp4',
'title': 'The beauty and power of algorithms',
'thumbnail': r're:^https?://.+\.jpg',
'description': 'md5:734e352710fb00d840ab87ae31aaf688',
'uploader': 'Vishal Sikka',
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
'info_dict': {
'id': '1972',
'ext': 'mp4',
'title': 'Be passionate. Be courageous. Be your best.',
'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
'duration': 1128,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
'info_dict': {
'id': '10',
'title': 'Who are the hackers?',
'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
},
'playlist_mincount': 6,
}, {
# contains a youtube video
'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
'add_ie': ['Youtube'],
'info_dict': {
'id': '_ZG8HBuDjgc',
'ext': 'webm',
'title': 'Douglas Adams: Parrots the Universe and Everything',
'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
'uploader': 'University of California Television (UCTV)',
'uploader_id': 'UCtelevision',
'upload_date': '20080522',
},
'params': {
'skip_download': True,
},
}, {
# no nativeDownloads
'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
'info_dict': {
'id': '1792',
'ext': 'mp4',
'title': 'The orchestra in my mouth',
'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
'uploader': 'Tom Thum',
'view_count': int,
'comment_count': int,
'tags': list,
},
'params': {
'skip_download': True,
},
}, {
# with own formats and private Youtube external
'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
'only_matching': True,
}] }]
_NATIVE_FORMATS = {
'low': {'width': 320, 'height': 180},
'medium': {'width': 512, 'height': 288},
'high': {'width': 854, 'height': 480},
}
def _extract_info(self, webpage):
info_json = self._search_regex(
r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url): def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE) display_id = self._match_id(url)
if m.group('type').startswith('embed'): webpage = self._download_webpage(url, display_id)
desktop_url = m.group('proto') + 'www' + m.group('urlmain') talk_info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['videoData']
return self.url_result(desktop_url, 'TED') video_id = talk_info['id']
name = m.group('name') playerData = self._parse_json(talk_info.get('playerData'), video_id)
if m.group('type_talk'):
return self._talk_info(url, name)
elif m.group('type_watch'):
return self._watch_info(url, name)
else:
return self._playlist_videos_info(url, name)
def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
playlist_entries = []
for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
attrs = extract_attributes(entry)
entry_url = compat_urlparse.urljoin(url, attrs['href'])
playlist_entries.append(self.url_result(entry_url, self.ie_key()))
final_url = self._og_search_url(webpage, fatal=False)
playlist_id = (
re.match(self._VALID_URL, final_url).group('playlist_id')
if final_url else None)
return self.playlist_result(
playlist_entries, playlist_id=playlist_id,
playlist_title=self._og_search_title(webpage, fatal=False),
playlist_description=self._og_search_description(webpage))
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
info = self._extract_info(webpage)
data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
talk_info = data['talks'][0]
title = talk_info['title'].strip()
downloads = talk_info.get('downloads') or {}
native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
formats = [{
'url': format_url,
'format_id': format_id,
} for (format_id, format_url) in native_downloads.items() if format_url is not None]
subtitled_downloads = downloads.get('subtitledDownloads') or {}
for lang, subtitled_download in subtitled_downloads.items():
for q in self._NATIVE_FORMATS:
q_url = subtitled_download.get(q)
if not q_url:
continue
formats.append({
'url': q_url,
'format_id': '%s-%s' % (q, lang),
'language': lang,
})
if formats:
for f in formats:
finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
if finfo:
f.update(finfo)
player_talk = talk_info['player_talks'][0]
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None http_url = None
for format_id, resources in resources_.items(): formats, subtitles = [], {}
for format_id, resources in (playerData.get('resources') or {}).items():
if format_id == 'hls': if format_id == 'hls':
if not isinstance(resources, dict): stream_url = url_or_none(try_get(resources, lambda x: x['stream']))
continue
stream_url = url_or_none(resources.get('stream'))
if not stream_url: if not stream_url:
continue continue
formats.extend(self._extract_m3u8_formats( m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
stream_url, video_name, 'mp4', m3u8_id=format_id, stream_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
fatal=False)) formats.extend(m3u8_formats)
else: subtitles = self._merge_subtitles(subtitles, m3u8_subs)
if not isinstance(resources, list): continue
continue
if format_id == 'h264': if not isinstance(resources, list):
for resource in resources: continue
h264_url = resource.get('file') if format_id == 'h264':
if not h264_url: for resource in resources:
continue h264_url = resource.get('file')
bitrate = int_or_none(resource.get('bitrate')) if not h264_url:
formats.append({ continue
'url': h264_url, bitrate = int_or_none(resource.get('bitrate'))
'format_id': '%s-%sk' % (format_id, bitrate), formats.append({
'tbr': bitrate, 'url': h264_url,
}) 'format_id': '%s-%sk' % (format_id, bitrate),
if re.search(r'\d+k', h264_url): 'tbr': bitrate,
http_url = h264_url })
elif format_id == 'rtmp': if re.search(r'\d+k', h264_url):
streamer = talk_info.get('streamer') http_url = h264_url
if not streamer: elif format_id == 'rtmp':
continue streamer = talk_info.get('streamer')
for resource in resources: if not streamer:
formats.append({ continue
'format_id': '%s-%s' % (format_id, resource.get('name')), formats.extend({
'url': streamer, 'format_id': '%s-%s' % (format_id, resource.get('name')),
'play_path': resource['file'], 'url': streamer,
'ext': 'flv', 'play_path': resource['file'],
'width': int_or_none(resource.get('width')), 'ext': 'flv',
'height': int_or_none(resource.get('height')), 'width': int_or_none(resource.get('width')),
'tbr': int_or_none(resource.get('bitrate')), 'height': int_or_none(resource.get('height')),
}) 'tbr': int_or_none(resource.get('bitrate')),
} for resource in resources if resource.get('file'))
m3u8_formats = list(filter(
lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url: if http_url:
m3u8_formats = [f for f in formats if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
for m3u8_format in m3u8_formats: for m3u8_format in m3u8_formats:
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate: if not bitrate:
continue continue
bitrate_url = re.sub(r'\d+k', bitrate, http_url) bitrate_url = re.sub(r'\d+k', bitrate, http_url)
if not self._is_valid_url( if not self._is_valid_url(
bitrate_url, video_name, '%s bitrate' % bitrate): bitrate_url, video_id, '%s bitrate' % bitrate):
continue continue
f = m3u8_format.copy() f = m3u8_format.copy()
f.update({ f.update({
@ -289,79 +120,123 @@ class TEDIE(InfoExtractor):
}) })
if not formats: if not formats:
external = player_talk.get('external') external = playerData.get('external') or {}
if isinstance(external, dict): service = external.get('service') or ''
service = external.get('service') ext_url = external.get('code') if service.lower() == 'youtube' else None
if isinstance(service, compat_str): return self.url_result(ext_url or external['uri'])
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
return self.url_result(ext_url or external['uri'])
self._sort_formats(formats) self._sort_formats(formats)
video_id = compat_str(talk_info['id']) thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage)
if thumbnail:
# trim thumbnail resize parameters
thumbnail = thumbnail.split('?')[0]
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': talk_info.get('title') or self._og_search_title(webpage),
'uploader': player_talk.get('speaker') or talk_info.get('speaker'), 'uploader': talk_info.get('presenterDisplayName'),
'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,
'duration': float_or_none(talk_info.get('duration')),
'view_count': int_or_none(data.get('viewed_count')),
'comment_count': int_or_none(
try_get(data, lambda x: x['comments']['count'])),
'tags': try_get(talk_info, lambda x: x['tags'], list),
}
def _get_subtitles(self, video_id, talk_info):
sub_lang_list = {}
for language in try_get(
talk_info,
(lambda x: x['downloads']['languages'],
lambda x: x['languages']), list):
lang_code = language.get('languageCode') or language.get('ianaCode')
if not lang_code:
continue
sub_lang_list[lang_code] = [
{
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
'ext': ext,
}
for ext in ['ted', 'srt']
]
return sub_lang_list
def _watch_info(self, url, name):
webpage = self._download_webpage(url, name)
config_json = self._html_search_regex(
r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
webpage, 'config', default=None)
if not config_json:
embed_url = self._search_regex(
r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
return self.url_result(self._proto_relative_url(embed_url))
config = json.loads(config_json)['config']
video_url = config['video']['url']
thumbnail = config.get('image', {}).get('url')
title = self._html_search_regex(
r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
description = self._html_search_regex(
[
r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
],
webpage, 'description', fatal=False)
return {
'id': name,
'url': video_url,
'title': title,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'description': description, 'description': talk_info.get('description') or self._og_search_description(webpage),
'subtitles': subtitles,
'formats': formats,
'duration': talk_info.get('duration') or parse_duration(self._og_search_property('video:duration', webpage)),
'view_count': str_to_int(talk_info.get('viewedCount')),
'upload_date': unified_strdate(talk_info.get('publishedAt')),
'release_date': unified_strdate(talk_info.get('recordedOn')),
'tags': try_get(playerData, lambda x: x['targeting']['tag'].split(',')),
} }
class TedSeriesIE(TedBaseIE):
    """Extract a TED series page as a playlist of talks.

    A single season can be selected with a ``#season_<n>`` URL fragment;
    without it the talks of every season are returned.
    """

    _VALID_URL = fr'{TedBaseIE._VALID_URL_BASE.format(type=r"series")}(?:#season_(?P<season>\d+))?'
    _TESTS = [{
        'url': 'https://www.ted.com/series/small_thing_big_idea',
        'info_dict': {
            'id': '3',
            'title': 'Small Thing Big Idea',
            'series': 'Small Thing Big Idea',
            'description': 'md5:6869ca52cec661aef72b3e9f7441c55c'
        },
        'playlist_mincount': 16,
    }, {
        'url': 'https://www.ted.com/series/the_way_we_work#season_2',
        'info_dict': {
            'id': '8_2',
            'title': 'The Way We Work Season 2',
            'series': 'The Way We Work',
            'description': 'md5:59469256e533e1a48c4aa926a382234c',
            'season_number': 2
        },
        'playlist_mincount': 8,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the series (or one season) at *url*.

        Raises KeyError if the page data lacks ``props.pageProps.seasons``.
        """
        display_id, season = self._match_valid_url(url).group('id', 'season')
        webpage = self._download_webpage(url, display_id, 'Downloading series webpage')
        info = self._search_nextjs_data(webpage, display_id)['props']['pageProps']

        # ``season`` is captured from the URL and is therefore always a string
        # of digits (or None). The page data may carry ``seasonNumber`` as a
        # string or as a number, so compare both sides as integers instead of
        # relying on their types happening to match.
        wanted_season = int_or_none(season)
        entries = itertools.chain.from_iterable(
            self._parse_playlist(s) for s in info['seasons']
            if wanted_season is None or int_or_none(s.get('seasonNumber')) == wanted_season)

        series_id = try_get(info, lambda x: x['series']['id'])
        series_name = try_get(info, lambda x: x['series']['name']) or self._og_search_title(webpage, fatal=False)

        return self.playlist_result(
            entries,
            f'{series_id}_{season}' if season and series_id else series_id,
            f'{series_name} Season {season}' if season else series_name,
            self._og_search_description(webpage),
            series=series_name, season_number=wanted_season)
class TedPlaylistIE(TedBaseIE):
    """Extract a ``ted.com/playlists/...`` page as a playlist of talks."""

    _VALID_URL = TedBaseIE._VALID_URL_BASE.format(type=r'playlists(?:/\d+)?')
    _TESTS = [{
        'url': 'https://www.ted.com/playlists/171/the_most_popular_talks_of_all',
        'info_dict': {
            'id': '171',
            'title': 'The most popular talks of all time',
            'description': 'md5:d2f22831dc86c7040e733a3cb3993d78'
        },
        'playlist_mincount': 25,
    }]

    def _real_extract(self, url):
        """Return a playlist result built from the page's embedded Next.js data."""
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)
        next_data = self._search_nextjs_data(page, slug)
        playlist = next_data['props']['pageProps']['playlist']

        playlist_id = playlist.get('id')
        title = playlist.get('title')
        if not title:
            # Fall back to the OpenGraph title with the site suffix removed;
            # an empty result is reported as None.
            title = self._og_search_title(page, default='').replace(' | TED Talks', '') or None
        description = self._og_search_description(page)

        return self.playlist_result(
            self._parse_playlist(playlist), playlist_id, title, description)
class TedEmbedIE(InfoExtractor):
    """Handle ``embed.ted.com`` / ``embed-ssl.ted.com`` URLs by delegating to TedTalkIE."""

    _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/'
    _TESTS = [{
        'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace',
        'info_dict': {
            'id': '21802',
            'ext': 'mp4',
            'title': 'How to get serious about diversity and inclusion in the workplace',
            'description': 'md5:0978aafe396e05341f8ecc795d22189d',
            'view_count': int,
            'tags': list,
            'uploader': 'Janet Stovall',
            'duration': 664.0,
            'upload_date': '20180822',
            'release_date': '20180719',
            'thumbnail': r're:http.*\.jpg',
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Return the ``src`` URL of every TED embed ``<iframe>`` in *webpage*."""
        iframe_pattern = fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1'
        embed_urls = []
        for match in re.finditer(iframe_pattern, webpage):
            embed_urls.append(match.group('url'))
        return embed_urls

    def _real_extract(self, url):
        """Rewrite the embed host to ``www`` and hand the URL to TedTalkIE."""
        talk_url = re.sub(r'://embed(-ssl)?', '://www', url)
        return self.url_result(talk_url, TedTalkIE.ie_key())