From 5fccabac27ca3c1165ade1b0df6fbadc24258dc2 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 21 Sep 2023 19:37:58 +0200 Subject: [PATCH] [ie/rbgtum] Fix extraction and support new URL format (#7690) Authored by: simon300000 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/rbgtum.py | 79 ++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 632d6720e1..9cda06d8fa 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1601,6 +1601,7 @@ from .rbmaradio import RBMARadioIE from .rbgtum import ( RbgTumIE, RbgTumCourseIE, + RbgTumNewCourseIE, ) from .rcs import ( RCSIE, diff --git a/yt_dlp/extractor/rbgtum.py b/yt_dlp/extractor/rbgtum.py index 47649cfc58..c8a331f3ee 100644 --- a/yt_dlp/extractor/rbgtum.py +++ b/yt_dlp/extractor/rbgtum.py @@ -1,10 +1,11 @@ import re from .common import InfoExtractor +from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError class RbgTumIE(InfoExtractor): - _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' + _VALID_URL = r'https://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)' _TESTS = [{ # Combined view 'url': 'https://live.rbg.tum.de/w/cpp/22128', @@ -35,16 +36,18 @@ class RbgTumIE(InfoExtractor): 'title': 'Fachschaftsvollversammlung', 'series': 'Fachschaftsvollversammlung Informatik', } + }, { + 'url': 'https://tum.live/w/linalginfo/27102', + 'only_matching': True, }, ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') - lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') - lecture_series_title = self._html_search_regex( - r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series') + m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8') + lecture_title = 
self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') @@ -57,9 +60,9 @@ class RbgTumIE(InfoExtractor): class RbgTumCourseIE(InfoExtractor): - _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)' + _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P<id>(?P<year>\d+)/(?P<term>\w+)/(?P<slug>[^/?#]+))' _TESTS = [{ - 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv', 'info_dict': { 'title': 'Funktionale Programmierung und Verifikation (IN0003)', 'id': '2022/S/fpv', @@ -69,7 +72,7 @@ class RbgTumCourseIE(InfoExtractor): }, 'playlist_count': 13, }, { - 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'url': 'https://live.rbg.tum.de/old/course/2022/W/set', 'info_dict': { 'title': 'SET FSMPIC', 'id': '2022/W/set', @@ -78,16 +81,62 @@ class RbgTumCourseIE(InfoExtractor): 'noplaylist': False, }, 'playlist_count': 6, + }, { + 'url': 'https://tum.live/old/course/2023/S/linalginfo', + 'only_matching': True, }, ] def _real_extract(self, url): - course_id = self._match_id(url) - webpage = self._download_webpage(url, course_id) + course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug') + meta = self._download_json( + f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False, + query={'year': year, 'term': term}) or {} + lecture_series_title = meta.get('Name') + lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE) + for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))] - lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') + if not lectures: + webpage = self._download_webpage(url, course_id) + lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ') + 
lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE) + for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)] - lecture_urls = [] - for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage): - lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key())) + return self.playlist_result(lectures, course_id, lecture_series_title) - return self.playlist_result(lecture_urls, course_id, lecture_series_title) + + +class RbgTumNewCourseIE(InfoExtractor): + _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/\?' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, { + 'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3', + 'only_matching': True, + }] + + def _real_extract(self, url): + query = parse_qs(url) + errors = [key for key in ('year', 'term', 'slug') if not query.get(key)] + if errors: + raise ExtractorError(f'Input URL is missing query parameters: {", ".join(errors)}') + year, term, slug = query['year'][0], query['term'][0], query['slug'][0] + hostname = self._match_valid_url(url).group('hostname') + + return self.url_result(f'https://{hostname}/old/course/{year}/{term}/{slug}', RbgTumCourseIE)