From ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c Mon Sep 17 00:00:00 2001 From: Anna Bernardi Date: Thu, 6 Jun 2013 13:27:27 +0200 Subject: [PATCH 01/12] Implement search_regex from #847 --- youtube_dl/InfoExtractors.py | 633 ++++++++++++++--------------------- 1 file changed, 252 insertions(+), 381 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b40edf5fbb..4d13c17e44 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,6 +191,20 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info + def _search_regex(self, pattern, text, name, fatal=True, flags=0): + """Extract a field from some text based on regex""" + mobj = re.search(pattern, text, flags) + if mobj is None and fatal: + raise ExtractorError(u'Unable to extract %s; ' + u'please report this issue on GitHub.' % name) + elif mobj is None: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % name) + return None + else: + # return the first matched group + return next(g for g in mobj.groups() if g is not None) + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -964,18 +978,13 @@ class PhotobucketIE(InfoExtractor): }] # We try looking in other parts of the webpage - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL + video_url = self._search_regex(r'', + webpage, u'video URL') mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') return [{ @@ -1803,10 +1812,7 @@ class DepositFilesIE(InfoExtractor): file_extension = os.path.splitext(file_url)[1][1:] # Search for file title - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - file_title = mobj.group(1).decode('utf-8') + file_title = self._search_regex(r'', webpage, u'title') return [{ 'id': file_id.decode('utf-8'), @@ -1900,10 +1906,9 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - m = re.search('

([^<]+)

', webpage) - if not m: - raise ExtractorError(u'Cannot find title in webpage') - video_title = unescapeHTML(m.group(1)) + video_title = self._search_regex('

([^<]+)

', + webpage, u'title') + video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2065,15 +2070,10 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - mobj = re.search('([^<]+)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('([^<]+)', + webpage, u'title') - mobj = re.search('[.](.+?)$', video_url) - if mobj is None: - raise ExtractorError(u'Unable to extract extention') - video_ext = mobj.group(1) + video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') return [{ 'id': video_id, @@ -2121,25 +2121,23 @@ class MyVideoIE(InfoExtractor): # extracting infos self.report_extraction(video_id) + video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract rtmpurl') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) - if 'myvideo2flash' in video_rtmpurl: - self._downloader.report_warning(u'forcing RTMPT ...') - video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://') + if mobj: + video_url = compat_urllib_parse.unquote(mobj.group(1)) + if 'myvideo2flash' in video_url: + self._downloader.report_warning(u'forcing RTMPT ...') + video_url = video_url.replace('rtmpe://', 'rtmpt://') - # extract non rtmp videos - if (video_rtmpurl is None) or (video_rtmpurl == ''): + if not video_url: + # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError(u'unable to extract url') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) - mobj = re.search('source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_file = compat_urllib_parse.unquote(mobj.group(1)) + video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file') + video_file = compat_urllib_parse.unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -2151,20 +2149,16 @@ class MyVideoIE(InfoExtractor): video_filepath + video_file ).replace('.f4m', '.m3u8') - mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') + video_swfobj = compat_urllib_parse.unquote(video_swfobj) - mobj = re.search("(.*?)", webpage) - if mobj is None: - raise ExtractorError(u'unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex("(.*?)", + webpage, u'title') return [{ 'id': video_id, - 'url': video_rtmpurl, - 'tc_url': video_rtmpurl, + 'url': video_url, + 'tc_url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, @@ -2175,6 +2169,7 @@ class MyVideoIE(InfoExtractor): 'player_url': video_swfobj, }] + class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ -2357,16 +2352,22 @@ class EscapistIE(InfoExtractor): videoId = mobj.group('episode') self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + webpage = self._download_webpage(url, showName) - descMatch = re.search('(.*?)\s+-\s+XVID', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) - + video_title = self._search_regex(r'(.*?)\s+-\s+XVID', + webpage, u'title') # Extract video thumbnail - mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(0) + video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, @@ -2652,16 +2644,12 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title - mobj = re.search(r'contentTitle = "(.*?)";', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) + video_title = self._search_regex(r'contentTitle = "(.*?)";', + webpage, u'title') # Extract description - video_description = u'No description available.' - mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) - if mobj is not None: - video_description = mobj.group(1) + video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2832,15 +2820,16 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') + # TODO: implement default_value in search_regex m = re.search('<h1>([^<]+)</h1>', coursepage) if m: info['title'] = unescapeHTML(m.group(1)) else: info['title'] = info['id'] - m = re.search('<description>([^<]+)</description>', coursepage) - if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = self._search_regex('<description>([^<]+)</description>', + coursepage, u'description', fatal=False) + if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2901,25 +2890,19 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract song name') - song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract performer') - performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + webpage, u'song name', fatal=False) + if song_name: song_name = unescapeHTML(song_name) - mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to mtvn_uri') - mtvn_uri = mobj.group(1) + video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract content id') - content_id = mobj.group(1) + mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + webpage, u'mtvn_uri', fatal=False) + + content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', + webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) @@ -3067,20 +3050,15 @@ class XNXXIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) - result = re.search(self.VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group(1)) + video_url = self._search_regex(self.VIDEO_URL_RE, + webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) - result = re.search(self.VIDEO_TITLE_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group(1) + video_title = self._search_regex(self.VIDEO_TITLE_RE, + webpage, u'title') - result = re.search(self.VIDEO_THUMB_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = result.group(1) + video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -3100,26 +3078,6 @@ class GooglePlusIE(InfoExtractor): _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def report_extract_entry(self, url): - """Report downloading extry""" - self.to_screen(u'Downloading entry: %s' % url) - - def report_date(self, upload_date): - """Report downloading extry""" - self.to_screen(u'Entry date: %s' % upload_date) - - def report_uploader(self, uploader): - """Report downloading extry""" - self.to_screen(u'Uploader: %s' % uploader) - - def report_title(self, video_title): - """Report downloading extry""" - self.to_screen(u'Title: %s' % video_title) - - def report_extract_vid_page(self, video_page): - """Report information extraction.""" - self.to_screen(u'Extracting video page: %s' % video_page) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -3132,47 +3090,35 @@ class GooglePlusIE(InfoExtractor): video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information - self.report_extract_entry(post_url) webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') + self.report_extraction(video_id) + # Extract update date - upload_date = None - pattern = 'title="Timestamp">(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - upload_date = mobj.group(1) + upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + webpage, u'upload date', fatal=False) + if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') - self.report_date(upload_date) # Extract uploader - uploader = None - pattern = r'rel\="author".*?>(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - uploader = mobj.group(1) - self.report_uploader(uploader) + uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + webpage, u'uploader', fatal=False) # Extract title # Get the first line for title + # TODO: implement default_value in search_regex video_title = u'NA' pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' mobj = re.search(pattern, webpage) if mobj: video_title = mobj.group(1) - self.report_title(video_title) # Step 2, Stimulate clicking the image box to launch video - pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' - mobj = re.search(pattern, webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video page URL') - - video_page = mobj.group(1) + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') - self.report_extract_vid_page(video_page) - # Extract video links on video page """Extract video links of all sizes""" @@ -3220,6 +3166,8 @@ class NBAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + + # TODO: implement default_value in search_regex def _findProp(rexp, default=None): m = re.search(rexp, webpage) if m: @@ -3383,11 +3331,11 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) - if not m: - raise ExtractorError(u'Unable to find video information') - video_url = unescapeHTML(m.group('url')) + video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + webpage, u'video URL', flags=re.DOTALL) + video_url = unescapeHTML(video_url) + # TODO: implement fallbacks in regex_search m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) if not m: m = re.search(r'<title>(?P<title>[^<]+?)', webpage) @@ -3395,18 +3343,16 @@ class FunnyOrDieIE(InfoExtractor): raise ExtractorError(u'Cannot find video title') title = clean_html(m.group('title')) - m = re.search(r'.+)"',webpage) - title = m.group('title') - m = re.search(r'data-content-type="channel".*?>(?P.*?)', - webpage, re.DOTALL) - uploader = unescapeHTML(m.group('uploader').strip()) - m = re.search(r'.+)"', + webpage, u'title') + + uploader = self._search_regex(r'data-content-type="channel".*?>(?P.*?)', + webpage, u'uploader', fatal=False, flags=re.DOTALL) + if uploader: uploader = unescapeHTML(uploader.strip()) + + thumbnail = self._search_regex(r'(.*)", webpage_src) + video_title = self._search_regex(r"(.*)", + webpage_src, u'title') - if mobj is None: - raise ExtractorError(u'Cannot determine title') - title = mobj.group(1) - - mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - if mobj is not None: - thumbnail = mobj.group(1) - else: + thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + webpage_src, u'thumbnail', fatal=False) + + if not thumbnail: _title = r"""candytitles.*>(.*)""" mobj = re.search(_title, webpage_src) if mobj is not None: - title = mobj.group(1) - thumbnail = None + video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, - 'title' : title, + 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] @@ -3542,10 +3482,9 @@ class RBMARadioIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) + + json_data = self._search_regex(r'', + webpage, u'json data') try: data = json.loads(json_data) @@ -3592,7 +3531,6 @@ class YouPornIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) @@ -3600,34 +3538,23 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) # Get the video title - result = re.search(r'(?P.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>', + webpage, u'title').strip() # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = unified_strdate(result.group('date').strip()) + upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', + webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date.strip()) # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) + video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', + webpage, u'uploader', fatal=False) + if video_uploader: video_uploader = clean_html(video_uploader.strip()) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, + webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' @@ -3704,17 +3631,13 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) + video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) + upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, @@ -3741,10 +3664,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - result = re.search(r'<title>(?P<title>.*)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'(?P<title>.*)', + webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) @@ -3757,10 +3678,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, @@ -3783,10 +3702,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) + json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) @@ -3822,18 +3738,24 @@ class KeekIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'[\S\s]+?

(?P.+?)

', webpage) - uploader = clean_html(m.group('uploader')) + + video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + webpage, u'uploader', fatal=False) + if uploader: uploader = clean_html(uploader) + info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } @@ -3980,10 +3902,10 @@ class SpiegelIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'
(.*?)
', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) + + video_title = self._search_regex(r'
(.*?)
', + webpage, u'title') + video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -4019,35 +3941,27 @@ class LiveLeakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) + video_url = self._search_regex(r'file: "(.*?)",', + webpage, u'video URL') - m = re.search(r'', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None + video_uploader = self._search_regex(r'By:.*?(\w+)', + webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader } return [info] @@ -4105,23 +4019,24 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?Phttp://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video found") - return [] + raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') - re_thumb = r'posters(.*?)\[\\x22(?P.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') + video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P.*?)\\x22', + webpage, u'thumbnail', fatal=False) # We pick the first poster + if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + video_title = self._search_regex(r'(?P<title>.*?)', + webpage, u'title', flags=re.DOTALL) + video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumb, + 'title': video_title, + 'thumbnail': video_thumbnail, 'ext': ext }] @@ -4135,7 +4050,7 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: - raise ExtractorError(u'No free songs founded') + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P\d*?)$', @@ -4163,10 +4078,10 @@ class BandcampIE(InfoExtractor): track_info = {'id':id, 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, + 'ext' : 'mp3', + 'url' : final_url, 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] + 'uploader' : info[u'artist'] } return [track_info] @@ -4183,17 +4098,14 @@ class RedTubeIE(InfoExtractor): video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - mobj = re.search(r'',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + video_url = self._search_regex(r'', + webpage, u'video URL') - video_url = mobj.group(1) - mobj = re.search('

(.+)

',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('

(.+?)

', + webpage, u'title') return [{ 'id': video_id, @@ -4214,15 +4126,13 @@ class InaIE(InfoExtractor): video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'.*?)]]>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_url = self._search_regex(r'.*?)]]>', + webpage, u'title') return [{ 'id': video_id, @@ -4244,27 +4154,17 @@ class HowcastIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', + webpage, u'video URL') - mobj = re.search(r'\w+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -4289,25 +4188,17 @@ class VineIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'.*?

(.+?)

', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) + uploader = self._search_regex(r'
.*?

(.+?)

', + webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, @@ -4330,18 +4221,13 @@ class FlickrIE(InfoExtractor): webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) + secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - mobj = re.search(r'(\d+-\d+)', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) + node_id = self._search_regex(r'(\d+-\d+)', + first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') @@ -4353,22 +4239,14 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - mobj = re.search(r'(.*?)', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) + + video_url = self._search_regex(r'(.*?)', + data, u'video URL') return [{ 'id': video_id, @@ -4423,7 +4294,7 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, - 'description': description, + 'description': video_description, }] class XHamsterIE(InfoExtractor): From 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Thu, 6 Jun 2013 14:35:08 +0200 Subject: [PATCH 02/12] implement fallbacks and defaults in _search_regex --- youtube_dl/InfoExtractors.py | 86 +++++++++++++++++++----------------- youtube_dl/utils.py | 3 ++ 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4d13c17e44..fbf40f3ca9 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,19 +191,37 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, text, name, fatal=True, flags=0): - """Extract a field from some text based on regex""" - mobj = re.search(pattern, text, flags) - if mobj is None and fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % name) - elif mobj is None: - self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % name) - return None + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) else: - # return the first matched group + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: + raise ExtractorError(u'Unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + else: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + return None class SearchInfoExtractor(InfoExtractor): """ @@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - # TODO: implement default_value in search_regex - m = re.search('

([^<]+)

', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._search_regex('

([^<]+)

', coursepage, 'title', default=info['id']) + info['title'] = unescapeHTML(info['title']) info['description'] = self._search_regex('([^<]+)', coursepage, u'description', fatal=False) @@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor): # Extract title # Get the first line for title - # TODO: implement default_value in search_regex - video_title = u'NA' - pattern = r'Date:
(.*?)
', webpage, 'upload_date', fatal=False) + + description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'Date: (.*?)
'), - 'description': _findProp(r'
(.*?)'), + 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor): webpage, u'video URL', flags=re.DOTALL) video_url = unescapeHTML(video_url) - # TODO: implement fallbacks in regex_search - m = re.search(r"

(?P.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._search_regex((r"

(?P.*?)</h1>", + r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) + title = clean_html(title) video_description = self._search_regex(r' Date: Thu, 6 Jun 2013 15:07:05 +0200 Subject: [PATCH 03/12] print WARNINGs during test + minor fix to NBAIE --- test/test_download.py | 9 +++++++++ youtube_dl/InfoExtractors.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 3eca333f26..3e6bdd44e3 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -40,9 +40,18 @@ def _try_rm(filename): class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): + self._to_stderr = self.to_stderr self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) + def report_warning(self, message): + # let warnings pass to output + if sys.stderr.isatty() and os.name != 'nt': + _msg_header=u'\033[0;33mWARNING:\033[0m' + else: + _msg_header=u'WARNING:' + warning_message=u'%s %s' % (_msg_header,message) + self._to_stderr(warning_message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fbf40f3ca9..0f1880756f 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3183,7 +3183,7 @@ class NBAIE(InfoExtractor): uploader_date = self._search_regex(r'Date: (.*?)

', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + description = self._search_regex(r'', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, From be95cac157a75da1a0fa512b36eb90bc2c28cc96 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 11:19:27 +0200 Subject: [PATCH 04/12] raise exceptions on warnings during tests - and solve a couple of them --- test/test_download.py | 10 ++------- youtube_dl/InfoExtractors.py | 41 ++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 3e6bdd44e3..565b1ebc55 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -40,18 +40,12 @@ def _try_rm(filename): class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): - self._to_stderr = self.to_stderr self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) def report_warning(self, message): - # let warnings pass to output - if sys.stderr.isatty() and os.name != 'nt': - _msg_header=u'\033[0;33mWARNING:\033[0m' - else: - _msg_header=u'WARNING:' - warning_message=u'%s %s' % (_msg_header,message) - self._to_stderr(warning_message) + # Don't accept warnings during tests + raise ExtractorError(message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0f1880756f..bd6fce3b6f 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) @@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor): title = self._search_regex(r'Date: (.*?)
', webpage, 'upload_date', fatal=False) + # It isn't there in the HTML it returns to us + # uploader_date = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) description = self._search_regex(r'', webpage, 'description', fatal=False) @@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': uploader_date, + # 'uploader_date': uploader_date, 'description': description, } return [info] @@ -3541,19 +3540,22 @@ class YouPornIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - # Get the video title - video_title = self._search_regex(r'(?P.*)</h1>', - webpage, u'title').strip() + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video date - upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', - webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date.strip()) - - # Get the video uploader - video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', - webpage, u'uploader', fatal=False) - if video_uploader: video_uploader = clean_html(video_uploader.strip()) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' @@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor): 'title': title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if self._downloader.params.get('listformats', None): From 8409501206e37d57f01e5fe72bfc54a5562e4e0a Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Fri, 7 Jun 2013 11:46:03 +0200 Subject: [PATCH 05/12] use search_regex in new IEs --- youtube_dl/InfoExtractors.py | 50 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index bd6fce3b6f..5d54e93e78 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3347,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor): title = clean_html(title) video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', flags=re.DOTALL) + webpage, u'description', fatal=False, flags=re.DOTALL) if video_description: video_description = unescapeHTML(video_description) info = { @@ -4301,7 +4301,7 @@ class TeamcocoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' @@ -4310,8 +4310,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4321,32 +4322,26 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'Description: (?P[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + video_description = self._search_regex(r'Description: (?P[^<]+)', + webpage, u'description', fatal=False) + if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r']+>(?P[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4377,10 +4372,9 @@ class HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] From 8b59a9861040482c9af58e85fb397353ea2e8080 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 12:10:02 +0200 Subject: [PATCH 06/12] XHamster: Can't see the description anywhere in the UI --- youtube_dl/InfoExtractors.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 5d54e93e78..0d7db013bb 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4326,9 +4326,10 @@ class XHamsterIE(InfoExtractor): webpage, u'title') video_title = unescapeHTML(video_title) - video_description = self._search_regex(r'Description: (?P[^<]+)', - webpage, u'description', fatal=False) - if video_description: video_description = unescapeHTML(video_description) + # Can't see the description anywhere in the UI + # video_description = self._search_regex(r'Description: (?P[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: @@ -4348,7 +4349,7 @@ class XHamsterIE(InfoExtractor): 'url': video_url, 'ext': video_extension, 'title': video_title, - 'description': video_description, + # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail From f5a290eed949b7726a8d745960bbe9c6b8b7de52 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sat, 8 Jun 2013 09:56:34 +0200 Subject: [PATCH 07/12] print "please report this issue on GitHub" on every ExtractorError --- youtube_dl/InfoExtractors.py | 3 +-- youtube_dl/utils.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0d7db013bb..86cc7c7484 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -216,8 +216,7 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % _name) + raise ExtractorError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on GitHub.' % _name) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3a8dcf4d3c..718ee3aae0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -472,6 +472,7 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception From d5979c5d55b0df11973b9a2b6630fd676e5726d1 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:55:08 +0200 Subject: [PATCH 08/12] do not ask the user to report network errors --- youtube_dl/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 718ee3aae0..66ae41e319 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,7 +12,7 @@ import sys import traceback import zlib import email.utils -import json +import socket import datetime try: @@ -472,8 +472,11 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ - msg = msg + u'; please report this issue on GitHub.' + + if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:57:13 +0200 Subject: [PATCH 09/12] _html_search_regex with clean_html superpowers --- test/tests.json | 2 +- youtube_dl/InfoExtractors.py | 151 ++++++++++++++++------------------- 2 files changed, 72 insertions(+), 81 deletions(-) diff --git a/test/tests.json b/test/tests.json index c39d1d9c1c..82da27d5b5 100644 --- a/test/tests.json +++ b/test/tests.json @@ -325,7 +325,7 @@ "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! " + "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 86cc7c7484..6060a5988c 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -222,6 +222,16 @@ class InfoExtractor(object): u'please report this issue on GitHub.' % _name) return None + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -1923,9 +1933,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._search_regex('

([^<]+)

', + video_title = self._html_search_regex('

([^<]+)

', webpage, u'title') - video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2087,7 +2096,7 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - video_title = self._search_regex('([^<]+)', + video_title = self._html_search_regex('([^<]+)', webpage, u'title') video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') @@ -2169,7 +2178,7 @@ class MyVideoIE(InfoExtractor): video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') video_swfobj = compat_urllib_parse.unquote(video_swfobj) - video_title = self._search_regex("(.*?)", + video_title = self._html_search_regex("(.*?)", webpage, u'title') return [{ @@ -2371,17 +2380,14 @@ class EscapistIE(InfoExtractor): self.report_extraction(showName) webpage = self._download_webpage(url, showName) - videoDesc = self._search_regex('(.*?)\s+-\s+XVID', + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail @@ -2665,7 +2671,7 @@ class InfoQIE(InfoExtractor): webpage, u'title') # Extract description - video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] @@ -2837,12 +2843,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['title'] = unescapeHTML(info['title']) + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._search_regex('<description>([^<]+)</description>', + info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) - if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2903,15 +2907,13 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) - if song_name: song_name = unescapeHTML(song_name) - video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') - video_title = unescapeHTML(video_title) - mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', @@ -3067,7 +3069,7 @@ class XNXXIE(InfoExtractor): webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._search_regex(self.VIDEO_TITLE_RE, + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, @@ -3108,7 +3110,7 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename @@ -3116,12 +3118,12 @@ class GooglePlusIE(InfoExtractor): upload_date = upload_date.strftime('%Y%m%d') # Extract uploader - uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video @@ -3175,13 +3177,13 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._search_regex(r'<meta property="og:title" content="(.*?)"', + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us - # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, @@ -3337,17 +3339,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) - video_url = unescapeHTML(video_url) - title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - title = clean_html(title) - video_description = self._search_regex(r'.+)"', + video_title = self._html_search_regex(r'data-title="(?P.+)"', webpage, u'title') - uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) - if uploader: uploader = unescapeHTML(uploader.strip()) - thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { @@ -3454,11 +3452,11 @@ class WorldStarHipHopIE(InfoExtractor): else: ext = 'flv' - video_title = self._search_regex(r"<title>(.*)", + video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: @@ -3640,7 +3638,7 @@ class PornotubeIE(InfoExtractor): #Get the uploaded date VIDEO_UPLOADED_RE = r'
Added (?P[0-9\/]+) by' - upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, @@ -3668,7 +3666,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - video_title = self._search_regex(r'(?P<title>.*)', + video_title = self._html_search_regex(r'(?P<title>.*)', webpage, u'title').strip() # Get the embed page @@ -3747,13 +3745,11 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + uploader = self._html_search_regex(r'
[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) - if uploader: uploader = clean_html(uploader) info = { 'id': video_id, @@ -3907,9 +3903,8 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'
(.*?)
', + video_title = self._html_search_regex(r'
(.*?)
', webpage, u'title') - video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3948,15 +3943,13 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._search_regex(r'', + video_uploader = self._html_search_regex(r'By:.*?(\w+)
', webpage, u'uploader', fatal=False) info = { @@ -4033,9 +4026,8 @@ class TumblrIE(InfoExtractor): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - video_title = self._search_regex(r'(?P<title>.*?)', + video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) - video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, @@ -4105,10 +4097,10 @@ class RedTubeIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'', + video_url = self._html_search_regex(r'', webpage, u'video URL') - video_title = self._search_regex('

(.+?)

', + video_title = self._html_search_regex('

(.+?)

', webpage, u'title') return [{ @@ -4132,7 +4124,7 @@ class InaIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'.*?)]]>', @@ -4161,13 +4153,13 @@ class HowcastIE(InfoExtractor): video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') - video_title = self._search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'
.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ @@ -4230,7 +4222,7 @@ class FlickrIE(InfoExtractor): first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - node_id = self._search_regex(r'(\d+-\d+)', + node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' @@ -4243,13 +4235,13 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._search_regex(r'(.*?)', + video_url = self._html_search_regex(r'(.*?)', data, u'video URL') return [{ @@ -4321,12 +4313,11 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') - video_title = unescapeHTML(video_title) # Can't see the description anywhere in the UI - # video_description = self._search_regex(r'Description: (?P[^<]+)', + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) @@ -4337,7 +4328,7 @@ class XHamsterIE(InfoExtractor): video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') - video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^>]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', @@ -4373,7 +4364,7 @@ class HypemIE(InfoExtractor): self.report_extraction(track_id) - html_tracks = self._search_regex(r'', + html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) From 78d3442b1209d3858cfea1f7ca958f661784b5ab Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 14:21:42 +0200 Subject: [PATCH 10/12] test: extend the reach of info_dict checking * print the info_dict in a format suitable to easy adding to tests.json during tests if un-tested fields are detected * make it possible to put the crc32 in tests.json if the field is too long * complete the "info_dict" fields in existing tests * fixed the bugs catched doing this --- test/test_download.py | 21 +++- test/tests.json | 185 ++++++++++++++++++++++++++++------- youtube_dl/InfoExtractors.py | 17 ++-- 3 files changed, 177 insertions(+), 46 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 565b1ebc55..8621520336 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -7,8 +7,8 @@ import os import json import unittest import sys -import hashlib import socket +import binascii # Allow direct execution sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -38,6 +38,9 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise +def crc32(value): + return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff) + class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen @@ -124,7 +127,21 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - self.assertEqual(value, info_dict.get(info_field)) + if isinstance(value, compat_str) and value.startswith('crc32:'): + self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field))) + else: + self.assertEqual(value, info_dict.get(info_field)) + + # If checkable fields are missing from the test case, print the info_dict + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value)) + for key, value in info_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) + if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n') + + # Check for the presence of mandatory fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: for tc in test_cases: _try_rm(tc['file']) diff --git a/test/tests.json b/test/tests.json index 82da27d5b5..e9abb0950f 100644 --- a/test/tests.json +++ b/test/tests.json @@ -15,43 +15,76 @@ "name": "Dailymotion", "md5": "392c4b85a60a90dc4792da41ce3144eb", "url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech", - "file": "x33vw9.mp4" + "file": "x33vw9.mp4", + "info_dict": { + "uploader": "Alex and Van .", + "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } }, { "name": "Metacafe", "add_ie": ["Youtube"], "url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - "file": "_aUehQsCQtM.flv" + "file": "_aUehQsCQtM.flv", + "info_dict": { + "upload_date": "20090102", + "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", + "description": "crc32:5ef3bc57", + "uploader": "PBS", + "uploader_id": "PBS" + } }, { "name": "BlipTV", "md5": "b2d849efcf7ee18917e4b4d9ff37cafe", "url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352", - "file": "5779306.m4v" + "file": "5779306.m4v", + "info_dict": { + "upload_date": "20111205", + "description": "crc32:fa658d49", + "uploader": "Comic Book Resources - CBR TV", + "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" + } }, { "name": "XVideos", "md5": "1d0c835822f0a71a7bf011855db929d0", "url": "http://www.xvideos.com/video939581/funny_porns_by_s_-1", - "file": "939581.flv" + "file": "939581.flv", + "info_dict": { + "title": "Funny Porns By >>>>S<<<<<< -1" + } }, { "name": "YouPorn", "md5": "c37ddbaaa39058c76a7e86c6813423c1", "url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/", - "file": "505835.mp4" + "file": "505835.mp4", + "info_dict": { + "upload_date": "20101221", + "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + "uploader": "Ask Dan And Jennifer", + "title": "Sex Ed: Is It Safe To Masturbate Daily?" + } }, { "name": "Pornotube", "md5": "374dd6dcedd24234453b295209aa69b6", "url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing", - "file": "1689755.flv" + "file": "1689755.flv", + "info_dict": { + "upload_date": "20090708", + "title": "Marilyn-Monroe-Bathing" + } }, { "name": "YouJizz", "md5": "07e15fa469ba384c7693fd246905547c", "url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html", - "file": "2189178.flv" + "file": "2189178.flv", + "info_dict": { + "title": "Zeichentrick 1" + } }, { "name": "Vimeo", @@ -70,61 +103,103 @@ "name": "Soundcloud", "md5": "ebef0a451b909710ed1d7787dddbf0d7", "url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy", - "file": "62986583.mp3" + "file": "62986583.mp3", + "info_dict": { + "upload_date": "20121011", + "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", + "uploader": "E.T. ExTerrestrial Music", + "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" + } }, { "name": "StanfordOpenClassroom", "md5": "544a9468546059d4e80d76265b0443b8", "url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100", - "file": "PracticalUnix_intro-environment.mp4" + "file": "PracticalUnix_intro-environment.mp4", + "info_dict": { + "title": "Intro Environment" + } }, { "name": "XNXX", "md5": "0831677e2b4761795f68d417e0b7b445", "url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_", - "file": "1135332.flv" + "file": "1135332.flv", + "info_dict": { + "title": "lida » Naked Funny Actress (5)" + } }, { "name": "Youku", "url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", "file": "XNDgyMDQ2NTQw_part00.flv", "md5": "ffe3f2e435663dc2d1eea34faeff5b5b", - "params": { "test": false } + "params": { "test": false }, + "info_dict": { + "title": "youtube-dl test video \"'/\\ä↭𝕐" + } }, { "name": "NBA", "url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html", "file": "0021200253-okc-bkn-recap.nba.mp4", - "md5": "c0edcfc37607344e2ff8f13c378c88a4" + "md5": "c0edcfc37607344e2ff8f13c378c88a4", + "info_dict": { + "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", + "title": "Thunder vs. Nets" + } }, { "name": "JustinTV", "url": "http://www.twitch.tv/thegamedevhub/b/296128360", "file": "296128360.flv", - "md5": "ecaa8a790c22a40770901460af191c9a" + "md5": "ecaa8a790c22a40770901460af191c9a", + "info_dict": { + "upload_date": "20110927", + "uploader_id": 25114803, + "uploader": "thegamedevhub", + "title": "Beginner Series - Scripting With Python Pt.1" + } }, { "name": "MyVideo", "url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win", "file": "8229274.flv", - "md5": "2d2753e8130479ba2cb7e0a37002053e" + "md5": "2d2753e8130479ba2cb7e0a37002053e", + "info_dict": { + "title": "bowling-fail-or-win" + } }, { "name": "Escapist", "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate", "file": "6618-Breaking-Down-Baldurs-Gate.mp4", - "md5": "c6793dbda81388f4264c1ba18684a74d" + "md5": "c6793dbda81388f4264c1ba18684a74d", + "info_dict": { + "description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", + "uploader": "the-escapist-presents", + "title": "Breaking Down Baldur's Gate" + } }, { "name": "GooglePlus", "url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH", - "file": "ZButuJc6CtH.flv" + "file": "ZButuJc6CtH.flv", + "info_dict": { + "upload_date": "20120613", + "uploader": "井上ヨシマサ", + "title": "嘆きの天使 降臨" + } }, { "name": "FunnyOrDie", "url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version", "file": "0732f586d7.mp4", - "md5": "f647e9e90064b53b6e046e75d0241fbd" + "md5": "f647e9e90064b53b6e046e75d0241fbd", + "info_dict": { + "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.", + "title": "Heart-Shaped Box: Literal Video Version" + } }, { "name": "Steam", @@ -161,6 +236,7 @@ "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", "file": "12-jan-pythonthings.mp4", "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", "title": "A Few of My Favorite [Python] Things" }, "params": { @@ -173,7 +249,10 @@ "file": "422212.mp4", "md5": "4e2f5cb088a83cd8cdb7756132f9739d", "info_dict": { - "title": "thedailyshow-kristen-stewart part 1" + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } }, { @@ -224,42 +303,48 @@ "file": "11885679.m4a", "md5": "d30b5b5f74217410f4689605c35d1fd7", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885680.m4a", "md5": "4eb0a669317cd725f6bbd336a29f923a", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885682.m4a", "md5": "1893e872e263a2705558d1d319ad19e8", "info_dict": { - "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885683.m4a", "md5": "b673c46f47a216ab1741ae8836af5899", "info_dict": { - "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885684.m4a", "md5": "1d74534e95df54986da7f5abf7d842b7", "info_dict": { - "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885685.m4a", "md5": "f081f47af8f6ae782ed131d38b9cd1c0", "info_dict": { - "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } } ] @@ -270,9 +355,9 @@ "file": "NODfbab.mp4", "md5": "9b0636f8c0f7614afa4ea5e4c6e57e83", "info_dict": { + "uploader": "ytdl", "title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." } - }, { "name": "TED", @@ -290,14 +375,19 @@ "file": "11741.mp4", "md5": "0b49f4844a068f8b33f4b7c88405862b", "info_dict": { - "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" + "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" } }, { "name": "Generic", "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", "file": "13601338388002.mp4", - "md5": "85b90ccc9d73b4acd9138d3af4c27f89" + "md5": "85b90ccc9d73b4acd9138d3af4c27f89", + "info_dict": { + "uploader": "www.hodiho.fr", + "title": "Régis plante sa Jeep" + } }, { "name": "Spiegel", @@ -355,42 +445,59 @@ "file":"30510138.mp3", "md5":"f9136bf103901728f29e419d2c70f55d", "info_dict": { - "title":"D-D-Dance" + "upload_date": "20111213", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "D-D-Dance" } }, { "file":"47127625.mp3", "md5":"09b6758a018470570f8fd423c9453dd8", "info_dict": { - "title":"The Royal Concept - Gimme Twice" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "The Royal Concept - Gimme Twice" } }, { "file":"47127627.mp3", "md5":"154abd4e418cea19c3b901f1e1306d9c", "info_dict": { - "title":"Goldrushed" + "upload_date": "20120521", + "uploader": "The Royal Concept", + "title": "Goldrushed" } }, { "file":"47127629.mp3", "md5":"2f5471edc79ad3f33a683153e96a79c1", "info_dict": { - "title":"In the End" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "In the End" } }, { "file":"47127631.mp3", "md5":"f9ba87aa940af7213f98949254f1c6e2", "info_dict": { - "title":"Knocked Up" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", + "uploader": "The Royal Concept", + "title": "Knocked Up" } }, { "file":"75206121.mp3", "md5":"f9d1fe9406717e302980c30de4af9353", "info_dict": { - "title":"World On Fire" + "upload_date": "20130116", + "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ", + "uploader": "The Royal Concept", + "title": "World On Fire" } } ] @@ -419,8 +526,10 @@ "url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0", "file": "zpsc0c3b9fa.mp4", "md5": "7dabfb92b0a31f6c16cebc0f8e60ff99", - "info_dict":{ - "title":"Tired of Link Building? Try BacklinkMyDomain.com!" + "info_dict": { + "upload_date": "20130504", + "uploader": "rachaneronas", + "title": "Tired of Link Building? Try BacklinkMyDomain.com!" } }, { @@ -488,8 +597,10 @@ "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html", "file": "1509445.flv", "md5": "9f48e0e8d58e3076bb236ff412ab62fa", - "info_dict":{ - "title":"FemaleAgent Shy beauty takes the bait" + "info_dict": { + "upload_date": "20121014", + "uploader_id": "Ruseful2011", + "title": "FemaleAgent Shy beauty takes the bait" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6060a5988c..24e9c4cc7b 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -2377,8 +2377,8 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webpage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) videoDesc = self._html_search_regex(']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', From ee55fcbe121baa0dacc9f87b9aa3abd974291355 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 15:03:54 +0200 Subject: [PATCH 11/12] switch long info_dict fields checking to md5 --- test/test_download.py | 9 ++++----- test/tests.json | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 8621520336..577bcdbf2d 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -38,8 +38,7 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise -def crc32(value): - return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff) +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): @@ -127,13 +126,13 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - if isinstance(value, compat_str) and value.startswith('crc32:'): - self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field))) + if isinstance(value, compat_str) and value.startswith('md5:'): + self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field))) else: self.assertEqual(value, info_dict.get(info_field)) # If checkable fields are missing from the test case, print the info_dict - test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value)) + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in info_dict.items() if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): diff --git a/test/tests.json b/test/tests.json index e9abb0950f..8a3e8e8e12 100644 --- a/test/tests.json +++ b/test/tests.json @@ -29,7 +29,7 @@ "info_dict": { "upload_date": "20090102", "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", - "description": "crc32:5ef3bc57", + "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8", "uploader": "PBS", "uploader_id": "PBS" } @@ -41,7 +41,7 @@ "file": "5779306.m4v", "info_dict": { "upload_date": "20111205", - "description": "crc32:fa658d49", + "description": "md5:9bc31f227219cde65e47eeec8d2dc596", "uploader": "Comic Book Resources - CBR TV", "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" } From af44c9486255f16ab180a9e45aaab06a6b38bdde Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 17 Jun 2013 19:25:35 +0200 Subject: [PATCH 12/12] use _search_regex in GenericIE --- youtube_dl/InfoExtractors.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 24e9c4cc7b..3c95012b19 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - mobj = re.search(r'(.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id,