From 401e28b318c3c38d5a9022f356972d142d585f84 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Tue, 16 Apr 2024 16:35:51 -0700 Subject: [PATCH 01/23] BBC Issue 9701: NEXT_DATA field video extraction for bbc Some bbc articles with embedded video have the data for them within a json structure tagged with NEXT_DATA. Add a parser for this case. Links tested: https://www.bbc.com/news/uk-68546268 https://www.bbc.com/news/world-middle-east-68778149 https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness --- yt_dlp/extractor/bbc.py | 52 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 015af9e1d..a0108fec5 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -791,6 +791,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1638230731, 'upload_date': '20211130', }, + }, { + # video with script id __NEXT_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/uk-68546268', + 'info_dict': { + 'id': 'p0hj0lq7', + 'ext': 'mp4', + 'title': 'Nasser Hospital doctor describes his treatment by IDF', + 'description': 'Doctor Abu Sabha said he was detained by Israeli forces after the raid on Nasser Hospital and feared for his life.\n\nThe IDF said "during the activity, about 200 terrorists and suspects of terrorist activity were detained, including some who posed as medical teams, many weapons were found, as well as closed medicines intended for Israeli hostages."', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1710270205000, + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1255,6 +1266,45 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + video_id = self._match_id(url) + next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['page'] + video_data = None + timestamp = None + for key in next_data: + for item in (try_get(next_data, lambda x: x[key]['contents'], list) or []): + if item.get('type') == 'video': + video_data = item + elif item.get('type') == 'timestamp': + timestamp = item + if video_data: + for item in (try_get(video_data, lambda x: x['model']['blocks'], list) or []): + if item.get('type') == 'media': + for subtype in (try_get(item, lambda x: x['model']['blocks'], list) or []): + if subtype.get('type') == 'mediaMetadata': + model = subtype.get('model') + if model: + item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + item_thumbnail = model.get('imageUrl') + item_title = model.get('title') + formats, subtitles = self._download_media_selector(item_id) + synopses = model.get('synopses') or {} + item_time = None + if timestamp: + item_time = try_get(timestamp, lambda x: x['model']['timestamp']) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item_thumbnail, + 'formats': formats, + 'subtitles': subtitles, + 'timestamp': item_time, + 'description': dict_get(synopses, ('long', 'medium', 'short')) + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX From 5f35e175726469477e371996d050bfe7b2c68798 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Thu, 18 Apr 2024 09:12:36 -0700 Subject: [PATCH 02/23] Using traverse_obj --- yt_dlp/extractor/bbc.py | 57 +++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a0108fec5..1bc7a69b7 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1268,40 +1268,31 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - video_id = self._match_id(url) - next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['page'] - video_data = None - timestamp = None - for key in next_data: - for item in (try_get(next_data, lambda x: x[key]['contents'], list) or []): - if item.get('type') == 'video': - video_data = item - elif item.get('type') == 'timestamp': - timestamp = item + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', 'page'), get_all=False) + video_data = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) if video_data: - for item in (try_get(video_data, lambda x: x['model']['blocks'], list) or []): - if item.get('type') == 'media': - for subtype in (try_get(item, lambda x: x['model']['blocks'], list) or []): - if subtype.get('type') == 'mediaMetadata': - model = subtype.get('model') - if model: - item_id = try_get(model, lambda x: x['versions'][0]['versionId']) - item_thumbnail = model.get('imageUrl') - item_title = model.get('title') - formats, subtitles = self._download_media_selector(item_id) - synopses = model.get('synopses') or {} - item_time = None - if timestamp: - item_time = try_get(timestamp, lambda x: x['model']['timestamp']) - entries.append({ - 'id': item_id, - 'title': item_title, - 'thumbnail': item_thumbnail, - 'formats': formats, - 'subtitles': subtitles, - 'timestamp': item_time, - 'description': dict_get(synopses, ('long', 'medium', 'short')) - }) + timestamp = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {int_or_none}), get_all=False) + model = traverse_obj(video_data, ( + 'model', 'blocks', lambda _, v: v['type'] == 'media', + 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + 'model'), get_all=False) + if model: + item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + formats, subtitles = self._download_media_selector(item_id) + synopses = model.get('synopses') or {} + entries.append({ + 'id': item_id, + 'title': model.get('title'), + 'thumbnail': model.get('imageUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'timestamp': timestamp, + 'description': dict_get(synopses, ('long', 'medium', 'short')) + }) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From 89eaee2ff83de8bcd44472d39e89110fec8acf08 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:04:05 -0700 Subject: [PATCH 03/23] one more tranverse --- yt_dlp/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 1bc7a69b7..be36bbb63 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1281,7 +1281,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + item_id = traverse_obj(model, ( + 'versions', 0, 'versionId'), get_all=False) formats, subtitles = self._download_media_selector(item_id) synopses = model.get('synopses') or {} entries.append({ From b9af6bf2ce8d5d8a5841dc84f0ed63d762c50e36 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:06:59 -0700 Subject: [PATCH 04/23] nit, style --- yt_dlp/extractor/bbc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index be36bbb63..9419f1ce1 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1281,8 +1281,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = traverse_obj(model, ( - 'versions', 0, 'versionId'), get_all=False) + item_id = traverse_obj(model, ('versions', 0, 'versionId')) formats, subtitles = self._download_media_selector(item_id) synopses = model.get('synopses') or {} entries.append({ From 9dbd9fc8734c76e0561bfd3de4038a0fef521491 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:50:22 -0700 Subject: [PATCH 05/23] more streamlining --- yt_dlp/extractor/bbc.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9419f1ce1..f882c56b2 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1273,25 +1273,27 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE video_data = traverse_obj(next_data, ( ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) if video_data: - timestamp = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'timestamp', - 'model', 'timestamp', {int_or_none}), get_all=False) model = traverse_obj(video_data, ( 'model', 'blocks', lambda _, v: v['type'] == 'media', 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = traverse_obj(model, ('versions', 0, 'versionId')) + timestamp = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {int_or_none}, any)) + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) formats, subtitles = self._download_media_selector(item_id) - synopses = model.get('synopses') or {} entries.append({ 'id': item_id, - 'title': model.get('title'), - 'thumbnail': model.get('imageUrl'), 'formats': formats, 'subtitles': subtitles, 'timestamp': timestamp, - 'description': dict_get(synopses, ('long', 'medium', 'short')) + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {url_or_none}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + }) }) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From e2ae76e84cba1e603f792ea7a0db9903c7bfce57 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 16:22:46 -0700 Subject: [PATCH 06/23] Making the parse_model function, address comments --- yt_dlp/extractor/bbc.py | 74 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index f882c56b2..46fa7b7de 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -798,9 +798,11 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p0hj0lq7', 'ext': 'mp4', 'title': 'Nasser Hospital doctor describes his treatment by IDF', - 'description': 'Doctor Abu Sabha said he was detained by Israeli forces after the raid on Nasser Hospital and feared for his life.\n\nThe IDF said "during the activity, about 200 terrorists and suspects of terrorist activity were detained, including some who posed as medical teams, many weapons were found, as well as closed medicines intended for Israeli hostages."', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276,} hostages\."$', 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1710270205000, + 'timestamp': 1710188248, + 'upload_date': '20240311', + 'duration': 104, }, }, { # single video article embedded with data-media-vpid @@ -1266,37 +1268,47 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + def parse_model(model): + '''Extract single video from model structure''' + if(type(model) == list): + model = model[0] + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), + }) + } + # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id), ( - 'props', 'pageProps', 'page'), get_all=False) - video_data = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) - if video_data: - model = traverse_obj(video_data, ( - 'model', 'blocks', lambda _, v: v['type'] == 'media', - 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', - 'model'), get_all=False) - if model: - timestamp = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'timestamp', - 'model', 'timestamp', {int_or_none}, any)) - item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) - formats, subtitles = self._download_media_selector(item_id) - entries.append({ - 'id': item_id, - 'formats': formats, - 'subtitles': subtitles, - 'timestamp': timestamp, - **traverse_obj(model, { - 'title': ('title', {str}), - 'thumbnail': ('imageUrl', {url_or_none}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), - }) - }) - return self.playlist_result( - entries, playlist_id, playlist_title, playlist_description) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( + 'props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'video', + 'model', 'blocks', lambda _, v: v['type'] == 'media', + 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + 'model')) + if model: + entry = parse_model(model) + if entry: + if entry.get('timestamp') is None: + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) From 4b9a54b464bd9ddc57170f27a8ffd4ac6a987bd3 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 16:41:19 -0700 Subject: [PATCH 07/23] flake8 check --- yt_dlp/extractor/bbc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 46fa7b7de..50ccf922f 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1270,7 +1270,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE def parse_model(model): '''Extract single video from model structure''' - if(type(model) == list): + if isinstance(model, list): model = model[0] item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) if not item_id: @@ -1285,10 +1285,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), 'description': ( 'synopses', ('long', 'medium', 'short'), {str}, any), - 'duration': ('versions', 0, 'duration', {int}), - 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), - }) - } + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), + }) + } # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) From 6ef899032037351f10a92145f931962c058e2300 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 18:17:01 -0700 Subject: [PATCH 08/23] different solution for traversal issues --- yt_dlp/extractor/bbc.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 50ccf922f..bf867394b 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -798,7 +798,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p0hj0lq7', 'ext': 'mp4', 'title': 'Nasser Hospital doctor describes his treatment by IDF', - 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276,} hostages\."$', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1710188248, 'upload_date': '20240311', @@ -1270,8 +1270,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE def parse_model(model): '''Extract single video from model structure''' - if isinstance(model, list): - model = model[0] item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) if not item_id: return @@ -1298,7 +1296,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE ..., 'contents', lambda _, v: v['type'] == 'video', 'model', 'blocks', lambda _, v: v['type'] == 'media', 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', - 'model')) + 'model', {dict}, any)) if model: entry = parse_model(model) if entry: From fba5c8f305f006d30e25022a691bac13777fbb37 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Mon, 22 Apr 2024 17:02:19 -0700 Subject: [PATCH 09/23] Incorporating changes for UK accessed articles --- yt_dlp/extractor/bbc.py | 346 +++++++++++++++++++++++++--------------- 1 file changed, 220 insertions(+), 126 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index bf867394b..a8fd7f3cd 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, join_nonempty, js_to_json, + merge_dicts, parse_duration, parse_iso8601, parse_qs, @@ -43,6 +44,7 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| + sounds/play/| events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) @@ -623,6 +625,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', 'title': 'BUGGER', + 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$', }, 'playlist_count': 18, }, { @@ -631,14 +634,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'md5:2868290467291b37feda7863f7a83f54', - 'duration': 47, + 'title': 'Germanwings crash site aerial video', + 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', + 'duration': None, # 47, 'timestamp': 1427219242, 'upload_date': '20150324', + 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', }, 'params': { - # rtmp download 'skip_download': True, } }, { @@ -656,7 +659,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': 'now SIMORGH_DATA with no video', }, { # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', @@ -670,7 +674,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + # TODO: now in .pageData.promo.media of SIMORGH_DATA + 'skip': 'video extraction failed', }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -683,22 +689,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { # single video story with digitalData 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'info_dict': { 'id': 'p02q6gc4', - 'ext': 'flv', - 'title': 'Sri Lanka’s spicy secret', - 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', - 'timestamp': 1437674293, - 'upload_date': '20150723', + 'ext': 'mp4', + # page title: 'Sri Lanka’s spicy secret', + 'title': 'Tasting the spice of life in Jaffna', + # page description: 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{149} aftertaste\.$', + 'timestamp': 1437935638, # was: 1437674293, + 'upload_date': '20150726', + 'duration': 255, }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { # single video story without digitalData 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', @@ -710,12 +716,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1415867444, 'upload_date': '20141113', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'skip': 'redirects to TopGear home page', }, { # single video embedded with Morph + # TODO: replacement test page 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', 'info_dict': { 'id': 'p041vhd0', @@ -726,27 +730,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'BBC Sport', 'uploader_id': 'bbc_sport', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Georestricted to UK', + 'skip': 'Video no longer in page', }, { - # single video with playlist.sxml URL in playlist param + # single video in __INITIAL_DATA__ (was: playlist.sxml URL in playlist param) 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', - 'duration': 140, + 'title': 'Ronaldo to Man Utd, Arsenal to spend?', + 'description': r'''re:(?s)BBC Sport's David Ornstein rounds up the latest transfer reports, .{359} here\.$''', + 'timestamp': 1437750175, + 'upload_date': '20150724', + 'thumbnail': 'https://news.bbcimg.co.uk/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', + 'duration': None, # 140, }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { - # article with multiple videos embedded with playlist.sxml in playlist param + # article with multiple videos embedded with Morph.setPayload 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', @@ -754,6 +753,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, + }, { + # lead item from above playlist + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': 'p034ppnv', + 'ext': 'mp4', + 'title': 'All you need to know about Jurgen Klopp', + 'timestamp': 1444335081, + 'upload_date': '20151008', + 'duration': 122.0, + 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg', + }, + 'params': { + 'noplaylist': True, + }, }, { # school report article with single video 'url': 'http://www.bbc.co.uk/schoolreport/35744779', @@ -762,6 +776,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': 'School which breaks down barriers in Jerusalem', }, 'playlist_count': 1, + 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt', }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -783,10 +798,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # video with window.__INITIAL_DATA__ and value as JSON string 'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'info_dict': { - 'id': 'p0b71qth', + 'id': 'p0b779gc', # was 'p0b71qth', 'ext': 'mp4', 'title': 'Why France is making this woman a national hero', - 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{291} Casseville$', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1638230731, 'upload_date': '20211130', @@ -830,6 +845,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'Radio 3', 'uploader_id': 'bbc_radio_three', }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'info_dict': { @@ -837,6 +853,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': 'md5:2fabf12a726603193a2879a055f72514', 'description': 'Learn English words and phrases from this story', + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg', }, 'add_ie': [BBCCoUkIE.ie_key()], }, { @@ -849,7 +866,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'alt_title': 'The downsides of positive thinking', 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', 'duration': 235, - 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)', 'upload_date': '20190604', 'categories': ['Psychology'], }, @@ -867,6 +884,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'duration': 1800, 'uploader_id': 'bbc_radio_three', }, + 'skip': '404 Not Found', }, { # onion routes 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1082,83 +1100,141 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE } # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) + # Several setPayload calls may be present but the video(s) + # should be in one that mentions leadMedia or videoData + morph_payload = self._search_json( + r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id, + contains_pattern=r'\{(?:(?!)[\s\S])+?(?:"leadMedia"|\\"videoData\\")\s*:(?:(?!)[\s\S])+\}', + default={}) if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + for component in traverse_obj(morph_payload, ( + 'body', 'components', lambda _, v: v['props']['leadMedia']['identifiers'])): + lead_media = component['props']['leadMedia'] + programme_id = traverse_obj(lead_media['identifiers'], 'vpid', 'playablePid', expected_type=str) if not programme_id: continue title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) return { 'id': programme_id, 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, + **traverse_obj(lead_media, { + 'description': ('summary', {str}), + 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), + 'uploader': ('masterBrand', {str}), + 'uploader_id': ('mid', {str}), + }), 'formats': formats, 'subtitles': subtitles, } + body = traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'body', + {lambda s: self._parse_json(s, playlist_id, fatal=False)})) + added = False + for video_data in traverse_obj(body, (Ellipsis, 'videoData', {lambda v: v.get('pid') and v})): + if video_data.get('vpid'): + video_id = video_data['vpid'] + formats, subtitles = self._download_media_selector(video_id) + entry = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + else: + video_id = video_data['pid'] + entry = self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % video_id, BBCCoUkIE.ie_key(), + video_id, url_transparent=True) + entry = merge_dicts( + traverse_obj(morph_payload, ( + 'body', 'content', 'article', { + 'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}), + })), traverse_obj(video_data, { + 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), + 'title': (('title', 'caption'), {str}, any), + 'duration': ('duration', {parse_duration}), + }), entry) + if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): + return entry + entries.append(entry) + added = True + if added: + playlist_title = traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'headline', {str})) or playlist_title + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + # various PRELOADED_STATE JSON + preload_state = self._search_json( + r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, + 'preload state', playlist_id, transform_source=js_to_json, default={}) + # PRELOADED_STATE with current programmme + current_programme = traverse_obj(preload_state, ( + 'programmes', 'current', {dict})) + if current_programme: + programme_id = traverse_obj(current_programme, ('id', {str})) + if programme_id and current_programme.get('type') == 'playable_item': + title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + **traverse_obj(current_programme, { + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), + 'duration': ('duration', 'value', {int_or_none}), + 'uploader': ('network', 'short_title', {str}), + 'uploader_id': ('network', 'id', {str}), + }), + 'subtitles': subtitles, + **traverse_obj(preload_state, { + 'chapters': ( + 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + } + ) + }), + } - preload_state = self._parse_json(self._search_regex( - r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if preload_state: - current_programme = preload_state.get('programmes', {}).get('current') or {} - programme_id = current_programme.get('id') - if current_programme and programme_id and current_programme.get('type') == 'playable_item': - title = current_programme.get('titles', {}).get('tertiary') or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - synopses = current_programme.get('synopses') or {} - network = current_programme.get('network') or {} - duration = int_or_none( - current_programme.get('duration', {}).get('value')) - thumbnail = None - image_url = current_programme.get('image_url') - if image_url: - thumbnail = image_url.replace('{recipe}', 'raw') + # PWA_PRELOADED_STATE with article video asset + asset_id = traverse_obj(preload_state, ( + 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id, + 'assetVideo', 0, {str}, any)) + if asset_id: + video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str})) + if video_id: + article = traverse_obj(preload_state, ( + 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any)) + + def image_url(image_id): + return traverse_obj(preload_state, ( + 'entities', 'images', image_id, 'url', + {lambda u: url_or_none(u.replace('$recipe', 'raw'))})) + + formats, subtitles = self._download_media_selector(video_id) return { - 'id': programme_id, - 'title': title, - 'description': dict_get(synopses, ('long', 'medium', 'short')), - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': network.get('short_title'), - 'uploader_id': network.get('id'), + 'id': video_id, + **traverse_obj(preload_state, ('entities', 'videos', asset_id, { + 'title': ('title', {str}), + 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any), + 'thumbnail': (0, {image_url}), + 'duration': ('duration', {int_or_none}), + })), 'formats': formats, 'subtitles': subtitles, - 'chapters': traverse_obj(preload_state, ( - 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { - 'title': ('titles', {lambda x: join_nonempty( - 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), - 'start_time': ('offset', 'start', {float_or_none}), - 'end_time': ('offset', 'end', {float_or_none}), - })) or None, + **traverse_obj(article, { + 'timestamp': ('displayDate', {parse_iso8601}), + }), } + else: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % asset_id, BBCCoUkIE.ie_key(), + asset_id, playlist_title, display_id=playlist_id, + description=playlist_description) bbc3_config = self._parse_json( self._search_regex( @@ -1204,6 +1280,28 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + k_int_or_none = functools.partial(int_or_none, scale=1000) + + def parse_model(model): + '''Extract single video from model structure''' + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {k_int_or_none}), + }) + } + initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, 'quoted preload state', default=None) @@ -1215,6 +1313,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: + added = False + for video_data in traverse_obj(initial_data, ( + 'stores', 'article', 'articleBodyContent', lambda _, v: v['type'] == 'video')): + model = traverse_obj(video_data, ( + 'model', 'blocks', lambda _, v: v['type'] == 'aresMedia', + 'model', 'blocks', lambda _, v: v['type'] == 'aresMediaMetadata', + 'model', {dict}, any)) + entry = parse_model(model) + if entry: + entries.append(entry) + added = True + if added: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def parse_media(media): if not media: return @@ -1248,18 +1361,19 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': item_time, 'description': strip_or_none(item_desc), }) + + for resp in traverse_obj(initial_data, ('data', lambda _, v: v.get('name'))): + name = resp['name'] for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, - (lambda x: x['data']['blocks'], - lambda x: x['data']['content']['model']['blocks'],), - list) or []): - if block.get('type') not in ['media', 'video']: - continue - parse_media(block.get('model')) + for block in traverse_obj(resp, ('data', ( + None, ('content', 'model')), 'blocks', + lambda _, v: v.get('type') in {'media', 'video'}, + 'model', {dict})): + parse_media(block) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) @@ -1268,26 +1382,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) - def parse_model(model): - '''Extract single video from model structure''' - item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) - if not item_id: - return - formats, subtitles = self._download_media_selector(item_id) - return { - 'id': item_id, - 'formats': formats, - 'subtitles': subtitles, - **traverse_obj(model, { - 'title': ('title', {str}), - 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), - 'duration': ('versions', 0, 'duration', {int}), - 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), - }) - } - # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( @@ -1303,7 +1397,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE if entry.get('timestamp') is None: entry['timestamp'] = traverse_obj(next_data, ( ..., 'contents', lambda _, v: v['type'] == 'timestamp', - 'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + 'model', 'timestamp', {k_int_or_none}, any)) entries.append(entry) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From bdfeb4357b56908534c2a0539905f08731adac45 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Tue, 23 Apr 2024 10:06:24 -0700 Subject: [PATCH 10/23] Removing unnecessary URL regex matcher clause --- yt_dlp/extractor/bbc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a8fd7f3cd..b753282be 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -44,7 +44,6 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| - sounds/play/| events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) From d5b48c06e6b13b86cd9c8e432ae58c10984914ab Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Tue, 23 Apr 2024 15:34:27 -0700 Subject: [PATCH 11/23] address comments --- yt_dlp/extractor/bbc.py | 143 ++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 78 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index b753282be..5292d8ff3 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, join_nonempty, js_to_json, - merge_dicts, parse_duration, parse_iso8601, parse_qs, @@ -635,7 +634,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': 'Germanwings crash site aerial video', 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', - 'duration': None, # 47, + 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', @@ -675,7 +674,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'skip_download': True, }, # TODO: now in .pageData.promo.media of SIMORGH_DATA - 'skip': 'video extraction failed', }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -696,11 +694,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p02q6gc4', 'ext': 'mp4', - # page title: 'Sri Lanka’s spicy secret', 'title': 'Tasting the spice of life in Jaffna', - # page description: 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{149} aftertaste\.$', - 'timestamp': 1437935638, # was: 1437674293, + 'timestamp': 1437935638, 'upload_date': '20150726', 'duration': 255, }, @@ -731,17 +727,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'skip': 'Video no longer in page', }, { - # single video in __INITIAL_DATA__ (was: playlist.sxml URL in playlist param) + # single video in __INITIAL_DATA__ 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', 'title': 'Ronaldo to Man Utd, Arsenal to spend?', - 'description': r'''re:(?s)BBC Sport's David Ornstein rounds up the latest transfer reports, .{359} here\.$''', + 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', 'timestamp': 1437750175, 'upload_date': '20150724', 'thumbnail': 'https://news.bbcimg.co.uk/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', - 'duration': None, # 140, + 'duration': 140, }, }, { # article with multiple videos embedded with Morph.setPayload @@ -753,7 +749,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'playlist_count': 3, }, { - # lead item from above playlist + # Testing noplaylist 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': 'p034ppnv', @@ -797,7 +793,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # video with window.__INITIAL_DATA__ and value as JSON string 'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'info_dict': { - 'id': 'p0b779gc', # was 'p0b71qth', + 'id': 'p0b779gc', 'ext': 'mp4', 'title': 'Why France is making this woman a national hero', 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{291} Casseville$', @@ -1038,8 +1034,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE webpage, 'group id', default=None) if group_id: return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) + f'https://www.bbc.co.uk/programmes/{group_id}', + ie=BBCCoUkIE) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( @@ -1106,17 +1102,15 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE contains_pattern=r'\{(?:(?!)[\s\S])+?(?:"leadMedia"|\\"videoData\\")\s*:(?:(?!)[\s\S])+\}', default={}) if morph_payload: - for component in traverse_obj(morph_payload, ( - 'body', 'components', lambda _, v: v['props']['leadMedia']['identifiers'])): - lead_media = component['props']['leadMedia'] - programme_id = traverse_obj(lead_media['identifiers'], 'vpid', 'playablePid', expected_type=str) + for lead_media in traverse_obj(morph_payload, ( + 'body', 'components', ..., 'props', 'leadMedia', {dict})): + programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)) if not programme_id: continue - title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) return { 'id': programme_id, - 'title': title, + 'title': lead_media.get('title') or self._og_search_title(webpage), **traverse_obj(lead_media, { 'description': ('summary', {str}), 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), @@ -1126,11 +1120,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'formats': formats, 'subtitles': subtitles, } - body = traverse_obj(morph_payload, ( - 'body', 'content', 'article', 'body', - {lambda s: self._parse_json(s, playlist_id, fatal=False)})) - added = False - for video_data in traverse_obj(body, (Ellipsis, 'videoData', {lambda v: v.get('pid') and v})): + body = self._parse_json(traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'body')), playlist_id, fatal=False) + for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')): if video_data.get('vpid'): video_id = video_data['vpid'] formats, subtitles = self._download_media_selector(video_id) @@ -1142,22 +1134,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE else: video_id = video_data['pid'] entry = self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % video_id, BBCCoUkIE.ie_key(), + f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, video_id, url_transparent=True) - entry = merge_dicts( - traverse_obj(morph_payload, ( + entry = { + **traverse_obj(morph_payload, ( 'body', 'content', 'article', { 'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}), - })), traverse_obj(video_data, { - 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), - 'title': (('title', 'caption'), {str}, any), - 'duration': ('duration', {parse_duration}), - }), entry) + } + )), + **traverse_obj(video_data, { + 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), + 'title': (('title', 'caption'), {str}, any), + 'duration': ('duration', {parse_duration}), + }), + } if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): return entry entries.append(entry) - added = True - if added: + if entries: playlist_title = traverse_obj(morph_payload, ( 'body', 'content', 'article', 'headline', {str})) or playlist_title return self.playlist_result( @@ -1168,36 +1162,34 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, 'preload state', playlist_id, transform_source=js_to_json, default={}) # PRELOADED_STATE with current programmme - current_programme = traverse_obj(preload_state, ( - 'programmes', 'current', {dict})) - if current_programme: - programme_id = traverse_obj(current_programme, ('id', {str})) - if programme_id and current_programme.get('type') == 'playable_item': - title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - return { - 'id': programme_id, - 'title': title, - 'formats': formats, - **traverse_obj(current_programme, { - 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), - 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), - 'duration': ('duration', 'value', {int_or_none}), - 'uploader': ('network', 'short_title', {str}), - 'uploader_id': ('network', 'id', {str}), - }), - 'subtitles': subtitles, - **traverse_obj(preload_state, { - 'chapters': ( - 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { - 'title': ('titles', {lambda x: join_nonempty( - 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), - 'start_time': ('offset', 'start', {float_or_none}), - 'end_time': ('offset', 'end', {float_or_none}), - } - ) - }), - } + current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) + programme_id = traverse_obj(current_programme, ('id', {str})) + if programme_id and current_programme.get('type') == 'playable_item': + title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + **traverse_obj(current_programme, { + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), + 'duration': ('duration', 'value', {int_or_none}), + 'uploader': ('network', 'short_title', {str}), + 'uploader_id': ('network', 'id', {str}), + }), + 'subtitles': subtitles, + **traverse_obj(preload_state, { + 'chapters': ( + 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + } + ) + }), + } # PWA_PRELOADED_STATE with article video asset asset_id = traverse_obj(preload_state, ( @@ -1231,7 +1223,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE } else: return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % asset_id, BBCCoUkIE.ie_key(), + f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE, asset_id, playlist_title, display_id=playlist_id, description=playlist_description) @@ -1282,7 +1274,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE k_int_or_none = functools.partial(int_or_none, scale=1000) def parse_model(model): - '''Extract single video from model structure''' + """Extract single video from model structure""" item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) if not item_id: return @@ -1312,7 +1304,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: - added = False for video_data in traverse_obj(initial_data, ( 'stores', 'article', 'articleBodyContent', lambda _, v: v['type'] == 'video')): model = traverse_obj(video_data, ( @@ -1322,8 +1313,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE entry = parse_model(model) if entry: entries.append(entry) - added = True - if added: + if entries: return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) @@ -1361,15 +1351,13 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': strip_or_none(item_desc), }) - for resp in traverse_obj(initial_data, ('data', lambda _, v: v.get('name'))): - name = resp['name'] for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in traverse_obj(resp, ('data', ( - None, ('content', 'model')), 'blocks', + for block in traverse_obj(resp, ( + 'data', (None, ('content', 'model')), 'blocks', lambda _, v: v.get('type') in {'media', 'video'}, 'model', {dict})): parse_media(block) @@ -1383,17 +1371,16 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( - 'props', 'pageProps', 'page')) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), + ('props', 'pageProps', 'page')) model = traverse_obj(next_data, ( ..., 'contents', lambda _, v: v['type'] == 'video', 'model', 'blocks', lambda _, v: v['type'] == 'media', 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model', {dict}, any)) if model: - entry = parse_model(model) - if entry: - if entry.get('timestamp') is None: + if entry := parse_model(model): + if not entry.get('timestamp'): entry['timestamp'] = traverse_obj(next_data, ( ..., 'contents', lambda _, v: v['type'] == 'timestamp', 'model', 'timestamp', {k_int_or_none}, any)) From 1d851a6751ef5295eb339fd88534aa23180c5cd3 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Tue, 23 Apr 2024 16:15:22 -0700 Subject: [PATCH 12/23] function for lambda --- yt_dlp/extractor/bbc.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 5292d8ff3..a8d4d5ee1 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1293,6 +1293,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }) } + def is_type(*types): + return lambda _, v: v['type'] in types + initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, 'quoted preload state', default=None) @@ -1305,10 +1308,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: for video_data in traverse_obj(initial_data, ( - 'stores', 'article', 'articleBodyContent', lambda _, v: v['type'] == 'video')): + 'stores', 'article', 'articleBodyContent', is_type('video'))): model = traverse_obj(video_data, ( - 'model', 'blocks', lambda _, v: v['type'] == 'aresMedia', - 'model', 'blocks', lambda _, v: v['type'] == 'aresMediaMetadata', + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), 'model', {dict}, any)) entry = parse_model(model) if entry: @@ -1374,15 +1377,15 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ('props', 'pageProps', 'page')) model = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'video', - 'model', 'blocks', lambda _, v: v['type'] == 'media', - 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + ..., 'contents', is_type('video'), + 'model', 'blocks', is_type('media'), + 'model', 'blocks', is_type('mediaMetadata'), 'model', {dict}, any)) if model: if entry := parse_model(model): if not entry.get('timestamp'): entry['timestamp'] = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'timestamp', + ..., 'contents', is_type('timestamp'), 'model', 'timestamp', {k_int_or_none}, any)) entries.append(entry) return self.playlist_result( From ab1cfa399ba427f31d468c68ccf775a2b61d8a46 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Wed, 24 Apr 2024 11:48:33 -0700 Subject: [PATCH 13/23] dirk's updates --- yt_dlp/extractor/bbc.py | 85 +++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a8d4d5ee1..ca0998a3c 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -663,7 +663,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { - 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'id': '39275083', + 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', @@ -673,7 +674,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, - # TODO: now in .pageData.promo.media of SIMORGH_DATA }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -689,7 +689,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'skip': '404 Not Found', }, { - # single video story with digitalData + # single video story with __PWA_PRELOADED_STATE__ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'info_dict': { 'id': 'p02q6gc4', @@ -736,7 +736,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', 'timestamp': 1437750175, 'upload_date': '20150724', - 'thumbnail': 'https://news.bbcimg.co.uk/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', + 'thumbnail': r're:https://(?:[^/]+/)+/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', 'duration': 140, }, }, { @@ -788,6 +788,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', + 'duration': 105, }, }, { # video with window.__INITIAL_DATA__ and value as JSON string @@ -800,6 +801,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1638230731, 'upload_date': '20211130', + 'duration': 125, }, }, { # video with script id __NEXT_DATA__ and value as JSON string @@ -867,19 +869,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, }, { # BBC Sounds - 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', + 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx', 'info_dict': { - 'id': 'm001q789', + 'id': 'p0hrw4nr', 'ext': 'mp4', - 'title': 'The Night Tracks Mix - Music for the darkling hour', - 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', - 'chapters': 'count:8', - 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', - 'uploader': 'Radio 3', - 'duration': 1800, - 'uploader_id': 'bbc_radio_three', - }, - 'skip': '404 Not Found', + 'title': 'Are our coastlines being washed away?', + 'description': r're:(?s)Around the world, coastlines are constantly changing .{2153} Images\)$', + 'timestamp': 1713556800, + 'upload_date': '20240419', + 'duration': 1588, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg', + 'uploader': 'World Service', + 'uploader_id': 'bbc_world_service', + 'series': 'CrowdScience', + } }, { # onion routes 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1165,7 +1168,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) programme_id = traverse_obj(current_programme, ('id', {str})) if programme_id and current_programme.get('type') == 'playable_item': - title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title + title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title formats, subtitles = self._download_media_selector(programme_id) return { 'id': programme_id, @@ -1177,6 +1180,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'duration': ('duration', 'value', {int_or_none}), 'uploader': ('network', 'short_title', {str}), 'uploader_id': ('network', 'id', {str}), + 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any), + 'series': ('titles', 'primary', {str}), }), 'subtitles': subtitles, **traverse_obj(preload_state, { @@ -1367,6 +1372,54 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + # extract from SIMORGH_DATA hydration JSON + simorgh_data = self._search_json( + r'window\s*\.\s*SIMORGH_DATA\s*=', webpage, + 'simorgh data', playlist_id, default={}) + if simorgh_data: + done = False + for video_data in traverse_obj(simorgh_data, ( + 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + if video_data['type'] == 'video': + entry = parse_model(model) + else: # legacyMedia: no duration, subtitles + block_id, entry = traverse_obj(model, ('blockId', {str})), None + media_data = traverse_obj(simorgh_data, ( + 'pageData', 'promo', 'media', + {lambda x: x if x['id'] == block_id else None})) + formats = traverse_obj(media_data, ('playlist', lambda _, v: v['url'], { + 'url': ('url', {url_or_none}), + 'ext': ('format', {str}), + 'tbr': ('bitrate', {k_int_or_none}), + }, {lambda u: u.get('url') and u})) + if formats: + entry = merge_dicts({ + 'id': block_id, + 'display_id': playlist_id, + 'formats': formats, + }, traverse_obj(simorgh_data, ('pageData', 'promo', { + 'description': ('summary', {str}), + })), traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'timestamp': ('firstPublished', {k_int_or_none}), + }), + ) + done = True + if entry: + entries.append(entry) + if done: + break + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From b0593ecfa4ab2ca785ac66d29de873b0d2f74a9d Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Wed, 24 Apr 2024 11:56:51 -0700 Subject: [PATCH 14/23] flake --- yt_dlp/extractor/bbc.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index ca0998a3c..42cc07b46 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1397,20 +1397,23 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'tbr': ('bitrate', {k_int_or_none}), }, {lambda u: u.get('url') and u})) if formats: - entry = merge_dicts({ + entry = { 'id': block_id, 'display_id': playlist_id, 'formats': formats, - }, traverse_obj(simorgh_data, ('pageData', 'promo', { - 'description': ('summary', {str}), - })), traverse_obj(model, { + **traverse_obj(simorgh_data, ( + 'pageData', 'promo', { + 'description': ('summary', {str}), + } + )), + **traverse_obj(model, { 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), 'description': ( 'synopses', ('long', 'medium', 'short'), {str}, any), 'timestamp': ('firstPublished', {k_int_or_none}), }), - ) + } done = True if entry: entries.append(entry) From bb87bafce6e6f49dae3ec49fe01a72320d46c9b2 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Wed, 24 Apr 2024 15:30:42 -0700 Subject: [PATCH 15/23] Revert to use merge_dicts and fix flake This reverts commit b0593ecfa4ab2ca785ac66d29de873b0d2f74a9d. --- yt_dlp/extractor/bbc.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 42cc07b46..1addae237 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, join_nonempty, js_to_json, + merge_dicts, parse_duration, parse_iso8601, parse_qs, @@ -1397,23 +1398,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'tbr': ('bitrate', {k_int_or_none}), }, {lambda u: u.get('url') and u})) if formats: - entry = { + entry = merge_dicts({ 'id': block_id, 'display_id': playlist_id, 'formats': formats, - **traverse_obj(simorgh_data, ( - 'pageData', 'promo', { - 'description': ('summary', {str}), - } - )), - **traverse_obj(model, { - 'title': ('title', {str}), - 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), - 'timestamp': ('firstPublished', {k_int_or_none}), - }), - } + }, traverse_obj(simorgh_data, ('pageData', 'promo', { + 'description': ('summary', {str}), + })), traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'timestamp': ('firstPublished', {k_int_or_none}), + }), + ) done = True if entry: entries.append(entry) From fd43ff21e28d5f50621ba17a81868d9bbadac6bd Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Thu, 25 Apr 2024 09:53:20 -0700 Subject: [PATCH 16/23] comments --- yt_dlp/extractor/bbc.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 1addae237..990640d2b 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1140,7 +1140,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE entry = self.url_result( f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, video_id, url_transparent=True) - entry = { + entry.update({ **traverse_obj(morph_payload, ( 'body', 'content', 'article', { 'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}), @@ -1151,7 +1151,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': (('title', 'caption'), {str}, any), 'duration': ('duration', {parse_duration}), }), - } + }) if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): return entry entries.append(entry) @@ -1367,8 +1367,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE elif name == 'article': for block in traverse_obj(resp, ( 'data', (None, ('content', 'model')), 'blocks', - lambda _, v: v.get('type') in {'media', 'video'}, - 'model', {dict})): + is_type('media', 'video'), 'model', {dict})): parse_media(block) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From 388bc9c97c32df1b66b4379418787d922c569687 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 26 Apr 2024 12:47:09 -0700 Subject: [PATCH 17/23] suggestions --- yt_dlp/extractor/bbc.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 990640d2b..0d42daf26 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, join_nonempty, js_to_json, - merge_dicts, parse_duration, parse_iso8601, parse_qs, @@ -1391,26 +1390,27 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE media_data = traverse_obj(simorgh_data, ( 'pageData', 'promo', 'media', {lambda x: x if x['id'] == block_id else None})) - formats = traverse_obj(media_data, ('playlist', lambda _, v: v['url'], { + formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { 'url': ('url', {url_or_none}), 'ext': ('format', {str}), 'tbr': ('bitrate', {k_int_or_none}), - }, {lambda u: u.get('url') and u})) + })) if formats: - entry = merge_dicts({ + entry = { 'id': block_id, 'display_id': playlist_id, 'formats': formats, - }, traverse_obj(simorgh_data, ('pageData', 'promo', { - 'description': ('summary', {str}), - })), traverse_obj(model, { - 'title': ('title', {str}), - 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ( + **traverse_obj(simorgh_data, ('pageData', 'promo', { + 'description': ('summary', {str}), + })), + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( 'synopses', ('long', 'medium', 'short'), {str}, any), - 'timestamp': ('firstPublished', {k_int_or_none}), - }), - ) + 'timestamp': ('firstPublished', {k_int_or_none}), + }), + } done = True if entry: entries.append(entry) From 8d78a0f1182b15efa51c9f15299905c9cc501533 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 26 Apr 2024 12:55:22 -0700 Subject: [PATCH 18/23] flake --- yt_dlp/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 0d42daf26..e81b52d36 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1407,7 +1407,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), + 'synopses', ('long', 'medium', 'short'), {str}, any + ), 'timestamp': ('firstPublished', {k_int_or_none}), }), } From 221a3c6dbaf6dfe33583ddc0abbef4eae4744d8a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 27 Apr 2024 12:36:52 +0530 Subject: [PATCH 19/23] Update yt_dlp/extractor/bbc.py Co-authored-by: dirkf --- yt_dlp/extractor/bbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index e81b52d36..5464af304 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1359,8 +1359,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': strip_or_none(item_desc), }) - for resp in (initial_data.get('data') or {}).values(): - name = resp.get('name') + for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])): + name = resp['name'] if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': From 706272edd2bd837d996b7179e547d7403d89f829 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 27 Apr 2024 12:53:01 +0530 Subject: [PATCH 20/23] style nitpick --- yt_dlp/extractor/bbc.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 5464af304..4fcbd2bfb 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1291,8 +1291,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE **traverse_obj(model, { 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), 'duration': ('versions', 0, 'duration', {int}), 'timestamp': ('versions', 0, 'availableFrom', {k_int_or_none}), }) @@ -1406,9 +1405,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE **traverse_obj(model, { 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any - ), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), 'timestamp': ('firstPublished', {k_int_or_none}), }), } From 604ada1987f81a163e32e3f0e4132c56622660c3 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Mon, 29 Apr 2024 11:23:05 -0700 Subject: [PATCH 21/23] Simplifications --- yt_dlp/extractor/bbc.py | 50 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 4fcbd2bfb..a5b232fce 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -670,6 +670,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'timestamp': 1434713142, 'upload_date': '20150619', + 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg', }, 'params': { 'skip_download': True, @@ -695,10 +696,11 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p02q6gc4', 'ext': 'mp4', 'title': 'Tasting the spice of life in Jaffna', - 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{149} aftertaste\.$', - 'timestamp': 1437935638, - 'upload_date': '20150726', + 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$', + 'timestamp': 1646058397, + 'upload_date': '20220228', 'duration': 255, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg', }, }, { # single video story without digitalData @@ -736,7 +738,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', 'timestamp': 1437750175, 'upload_date': '20150724', - 'thumbnail': r're:https://(?:[^/]+/)+/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', + 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', 'duration': 140, }, }, { @@ -797,10 +799,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p0b779gc', 'ext': 'mp4', 'title': 'Why France is making this woman a national hero', - 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{291} Casseville$', + 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.', 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1638230731, - 'upload_date': '20211130', + 'timestamp': 1638215626, + 'upload_date': '20211129', 'duration': 125, }, }, { @@ -864,7 +866,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', 'duration': 235, 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)', - 'upload_date': '20190604', + 'upload_date': '20220223', 'categories': ['Psychology'], }, }, { @@ -1140,11 +1142,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, video_id, url_transparent=True) entry.update({ - **traverse_obj(morph_payload, ( - 'body', 'content', 'article', { - 'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}), - } - )), + 'timestamp': traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}) + ), **traverse_obj(video_data, { 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), 'title': (('title', 'caption'), {str}, any), @@ -1184,16 +1184,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'series': ('titles', 'primary', {str}), }), 'subtitles': subtitles, - **traverse_obj(preload_state, { - 'chapters': ( - 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { - 'title': ('titles', {lambda x: join_nonempty( - 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), - 'start_time': ('offset', 'start', {float_or_none}), - 'end_time': ('offset', 'end', {float_or_none}), - } - ) - }), + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + }) + ), } # PWA_PRELOADED_STATE with article video asset @@ -1399,9 +1397,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': block_id, 'display_id': playlist_id, 'formats': formats, - **traverse_obj(simorgh_data, ('pageData', 'promo', { - 'description': ('summary', {str}), - })), + 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})), **traverse_obj(model, { 'title': ('title', {str}), 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), @@ -1425,7 +1421,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default='{}'), ('props', 'pageProps', 'page')) model = traverse_obj(next_data, ( ..., 'contents', is_type('video'), From de108ee3f1e022ea56238861ba4024e55b40c562 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 30 Apr 2024 16:47:36 +0530 Subject: [PATCH 22/23] Update yt_dlp/extractor/bbc.py --- yt_dlp/extractor/bbc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a5b232fce..7525c3530 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1220,9 +1220,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE })), 'formats': formats, 'subtitles': subtitles, - **traverse_obj(article, { - 'timestamp': ('displayDate', {parse_iso8601}), - }), + 'timestamp'; traverse_obj(article, ('displayDate', {parse_iso8601})), } else: return self.url_result( From 6608d38afed8652bb2980692e2a3e96b950911bc Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 30 Apr 2024 16:50:46 +0530 Subject: [PATCH 23/23] oops --- yt_dlp/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 7525c3530..f08d12857 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1220,7 +1220,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE })), 'formats': formats, 'subtitles': subtitles, - 'timestamp'; traverse_obj(article, ('displayDate', {parse_iso8601})), + 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})), } else: return self.url_result(