[extractor/common] Recognize src attribute from HTML5 media elements (#3899)

Authored by: Lesmiscore
2024-12-22 10:07:43 -05:00 · 2022-05-29 22:48:04 +09:00 · 2022-05-29 22:48:04 +09:00 · 222a230871
commit 222a230871
parent ee27297f82
2 changed files with 23 additions and 2 deletions
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@ -502,6 +502,24 @@ class TestInfoExtractor(unittest.TestCase):
                }],
            })

+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
    def test_extract_jwplayer_data_realworld(self):
        # from http://www.suffolk.edu/sjc/
        expect_dict(
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -3197,7 +3197,8 @@ class InfoExtractor:
                return f
            return {}

-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
@ -3215,6 +3216,7 @@ class InfoExtractor:
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                }]
            return is_plain_url, formats

@ -3241,7 +3243,8 @@ class InfoExtractor:
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content: