[duboku] add playlist extractor

2024-12-23 02:26:37 -05:00 · 2020-08-29 15:04:16 +08:00 · 2020-08-29 15:04:16 +08:00 · de4144a4ae
commit de4144a4ae
parent 503406d4bc
2 changed files with 97 additions and 1 deletions
--- a/youtube_dl/extractor/duboku.py
+++ b/youtube_dl/extractor/duboku.py
@ -4,10 +4,49 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import *
 def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value
    retlist = []
    for m in re.finditer(r'''(?xs)
        <(?P<tag>%s)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         %s%s
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html):
        retlist.append(m)
    return retlist
 def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return the first element matching the filters, or None if none match.

    Takes the same arguments as _get_elements_by_tag_and_attrib and passes
    them through unchanged; the result is that function's first list item.
    """
    matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    if not matches:
        return None
    return matches[0]
 class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.co'
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9\-]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
@ -90,3 +129,57 @@ class DubokuIE(InfoExtractor):
            'episode_id': episode_id,
            'formats': formats,
        }
 class DubokuPlaylistIE(InfoExtractor):
    """Playlist extractor for duboku.co series ("voddetail") pages.

    Each element on the page whose id looks like "playlist<N>" is treated as
    one playlist, made up of the links found inside it. A URL fragment
    selects a playlist by that id; without a fragment the first playlist on
    the page is used.
    """
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'
    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'

    def _real_extract(self, url):
        """Build a playlist result for the series page at url.

        Raises ExtractorError when the URL does not match _VALID_URL, when
        the playlist named by the URL fragment is missing or empty, or when
        no playlist can be found on the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        fragment = compat_urlparse.urlparse(url).fragment

        # Always fetch the canonical page, whatever host or suffix was given.
        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # Title: <h1 class="title">, then the keywords meta tag, then <title>.
        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # Collect playlists keyed by element id. The first id seen is
        # remembered explicitly because plain dict iteration order is not
        # guaranteed on older interpreters.
        playlists = {}
        first_playlist_id = None
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlists[playlist_id] = [
                {
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content')),
                }
                for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)
            ]
            if first_playlist_id is None:
                first_playlist_id = playlist_id

        # Use the playlist named by the fragment, else the first on the page.
        # A page with no playlists yields None here and reaches the
        # ExtractorError below rather than escaping as StopIteration.
        playlist = playlists.get(fragment if fragment else first_playlist_id)
        if not playlist:
            raise ExtractorError(
                'Cannot find %s' % fragment if fragment else 'Cannot extract playlist')

        # Entries are site-relative hrefs, so prefix the site origin.
        return self.playlist_result([
            self.url_result(
                'https://www.duboku.co' + x['href'], video_title=x.get('title'))
            for x in playlist], series_id, title)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -282,7 +282,10 @@ from .drtv import (
 )
 from .dtube import DTubeIE
 from .dvtv import DVTVIE
-from .duboku import DubokuIE
+from .duboku import (
    DubokuIE,
    DubokuPlaylistIE
 )
 from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE