From a83333c4328591c279a27dd0ec4c7c5addcc411f Mon Sep 17 00:00:00 2001 From: Teemu Ikonen <tpikonen@gmail.com> Date: Mon, 3 Oct 2022 00:23:48 +0300 Subject: [PATCH] [extractor/iltalehti] Add extractor (#5117) Authored by: tpikonen --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/iltalehti.py | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 yt_dlp/extractor/iltalehti.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4d94d3563..f104b3e35 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -718,6 +718,7 @@ from .iheart import ( IHeartRadioIE, IHeartRadioPodcastIE, ) +from .iltalehti import IltalehtiIE from .imdb import ( ImdbIE, ImdbListIE diff --git a/yt_dlp/extractor/iltalehti.py b/yt_dlp/extractor/iltalehti.py new file mode 100644 index 000000000..a40307aed --- /dev/null +++ b/yt_dlp/extractor/iltalehti.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class IltalehtiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iltalehti\.fi/[^/?#]+/a/(?P<id>[^/?#])' + _TESTS = [ + # jwplatform embed main_media + { + 'url': 'https://www.iltalehti.fi/ulkomaat/a/9fbd067f-94e4-46cd-8748-9d958eb4dae2', + 'md5': 'af12d42c539f1f49f0b62d231fe72dcd', + 'info_dict': { + 'id': 'gYjjaf1L', + 'ext': 'mp4', + 'title': 'Sensuroimaton Päivärinta, jakso 227: Vieraana Suomen Venäjän ex-suurlähettiläs René Nyberg ja Kenraalimajuri evp Pekka Toveri', + 'description': '', + 'upload_date': '20220928', + 'timestamp': 1664360878, + 'duration': 2089, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + # jwplatform embed body + { + 'url': 'https://www.iltalehti.fi/politiikka/a/1ce49d85-1670-428b-8db8-d2479b9950a4', + 'md5': '9e50334b8f8330ce8828b567a82a3c65', + 'info_dict': { + 'id': '18R6zkLi', + 'ext': 'mp4', + 'title': 'Pekka Toverin arvio: Näin Nord Stream -kaasuputken räjäyttäminen on saatettu toteuttaa', + 'description': 'md5:3d1302c9e17e7ffd564143ff58f8de35', + 'upload_date': '20220929', + 'timestamp': 1664435867, + 'duration': 165.0, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + ] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + info = self._search_json( + r'<script>\s*window.App\s*=\s*', webpage, 'json', article_id, + transform_source=js_to_json) + props = traverse_obj(info, ( + 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) + video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id')) + return self.playlist_from_matches( + video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}', + title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False))