diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 014324439f..6f1d6ef99e 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -26,9 +26,9 @@ tests = [ # 85 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), - # 84 + # 84 - vflh9ybst 2013/08/23 (sporadic) ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ09876543q1mnbvcxzasdfghjklpoiuew2"), + "yuioplkjhgfdsazxcvbnm1234567890QWERrYUIOPLKqHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<"), # 83 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4db8f0bf7..f71ae2713b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -29,6 +29,7 @@ from .gametrailers import GametrailersIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE @@ -57,6 +58,7 @@ from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .ro220 import Ro220IE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE @@ -116,12 +118,14 @@ _ALL_CLASSES = [ ] _ALL_CLASSES.append(GenericIE) + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ return [klass() for klass in _ALL_CLASSES] + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE'] diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 4c8a8af091..dc3a8d47d1 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -12,8 +12,8 @@ class C56IE(InfoExtractor): _TEST ={ u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', - u'file': u'93440716.mp4', - u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'file': u'93440716.flv', + u'md5': u'e59995ac63d0457783ea05f93f12a866', u'info_dict': { u'title': u'网事知多少 第32期:车怒', }, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index fa8c630d05..1ea449ca82 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,7 +21,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Amphora Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index da016f7ee8..d034a11bbc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,12 +7,14 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlparse, compat_urllib_request, ExtractorError, ) from .brightcove import BrightcoveIE + class GenericIE(InfoExtractor): IE_DESC = u'Generic downloader that works on some sites' _VALID_URL = r'.*' @@ -23,7 +25,7 @@ class GenericIE(InfoExtractor): u'file': u'13601338388002.mp4', u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', u'info_dict': { - u"uploader": u"www.hodiho.fr", + u"uploader": u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" } }, @@ -124,7 +126,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) - # Look for BrigthCove: + # Look for BrightCove: m_brightcove = re.search(r'', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') @@ -161,6 +163,10 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_url = compat_urllib_parse.unquote(mobj.group(1)) + if video_url.startswith('//'): + video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url + if '://' not in video_url: + video_url = url + ('' if url.endswith('/') else '/') + video_url video_id = os.path.basename(video_url) # here's a fun little line of code for you: diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py new file mode 100644 index 0000000000..ab0a69697e --- /dev/null +++ b/youtube_dl/extractor/hark.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class HarkIE(InfoExtractor): + _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _TEST = { + u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + u'file': u'mmbzyhkgny.mp3', + u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', + u'info_dict': { + u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) + webpage = self._download_webpage(embed_url, video_id) + + final_url = self._search_regex(r'src="(.+?).mp3"', + webpage, 'video url')+'.mp3' + title = self._html_search_regex(r'(.+?)', + webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( + 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + + return {'id': video_id, + 'url' : final_url, + 'title': title, + 'ext': determine_ext(final_url), + } diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py new file mode 100644 index 0000000000..c32f64d997 --- /dev/null +++ b/youtube_dl/extractor/ro220.py @@ -0,0 +1,42 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + compat_parse_qs, +) + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P[^/]+)/(?P[^/]+)/(?P[^/]+)' + _TEST = { + u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + u'file': u'LYV6doKo7f.mp4', + u'md5': u'03af18b73a07b4088753930db7a34add', + u'info_dict': { + u"title": u"Luati-le Banii sez 4 ep 1", + u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + flashVars_str = self._search_regex( + r'(?Prtl(?:(?P2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" + _VALID_URL = r'(?:http://)?(?P(?Prtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -48,6 +48,19 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + u'file': u'99205.flv', + u'info_dict': { + u'upload_date': u'20080928', + u'title': u'Medicopter 117 - Angst!', + u'description': u'Angst!', + u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + }, + u'params': { + u'skip_download': True, + }, }] def _real_extract(self,url): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4987b2b39..af01c9da02 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -427,7 +427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 85: return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] elif len(s) == 84: - return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27] + return s[5:40] + s[3] + s[41:48] + s[0] + s[49:84] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 52cfb8a6d1..ab1049cc0d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -476,7 +476,7 @@ def formatSeconds(secs): def make_HTTPS_handler(opts): if sys.version_info < (3,2): # Python's 2.x handler is very simplistic - return compat_urllib_request.HTTPSHandler() + return YoutubeDLHandlerHTTPS() else: import ssl context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) @@ -485,7 +485,7 @@ def make_HTTPS_handler(opts): context.verify_mode = (ssl.CERT_NONE if opts.no_check_certificate else ssl.CERT_REQUIRED) - return compat_urllib_request.HTTPSHandler(context=context) + return YoutubeDLHandlerHTTPS(context=context) class ExtractorError(Exception): """Error during info extraction.""" @@ -569,7 +569,8 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected -class YoutubeDLHandler(compat_urllib_request.HTTPHandler): + +class YoutubeDLHandler_Template: # Old-style class, like HTTPHandler """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -602,8 +603,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): ret.code = code return ret - def http_request(self, req): - for h,v in std_headers.items(): + def _http_request(self, req): + for h, v in std_headers.items(): if h in req.headers: del req.headers[h] req.add_header(h, v) @@ -618,7 +619,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['Youtubedl-user-agent'] return req - def http_response(self, req, resp): + def _http_response(self, req, resp): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': @@ -632,8 +633,16 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): resp.msg = old_resp.msg return resp - https_request = http_request - https_response = http_response + +class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler): + http_request = YoutubeDLHandler_Template._http_request + http_response = YoutubeDLHandler_Template._http_response + + +class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler): + https_request = YoutubeDLHandler_Template._http_request + https_response = YoutubeDLHandler_Template._http_response + def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD"""