From ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c Mon Sep 17 00:00:00 2001
From: Anna Bernardi <anna.bernardi.9@gmail.com>
Date: Thu, 6 Jun 2013 13:27:27 +0200
Subject: [PATCH 01/12] Implement search_regex from #847

---
 youtube_dl/InfoExtractors.py | 633 ++++++++++++++---------------------
 1 file changed, 252 insertions(+), 381 deletions(-)
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index b40edf5fbb..4d13c17e44 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,6 +191,20 @@ class InfoExtractor(object):
             video_info['title'] = playlist_title
         return video_info
 
+    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
+        """Return the first matched group of pattern in text (whole match if none).
+
+        name labels the field in messages; flags go to re.search. On no match,
+        raise ExtractorError when fatal, else warn via the downloader and
+        return None."""
+        mobj = re.search(pattern, text, flags)
+        if mobj is not None:
+            return next((g for g in mobj.groups() if g is not None), mobj.group(0))
+        if fatal:
+            raise ExtractorError(u'Unable to extract %s; please report this issue on GitHub.' % name)
+        self._downloader.report_warning(u'unable to extract %s; please report this issue on GitHub.' % name)
+        return None
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
@@ -964,18 +978,13 @@ class PhotobucketIE(InfoExtractor):
             }]
 
         # We try looking in other parts of the webpage
-        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
-
-        video_url = mediaURL
+        video_url = compat_urllib_parse.unquote(self._search_regex(  # file= value is URL-encoded
+            r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage, u'video URL'))
 
         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
         if mobj is None:
             raise ExtractorError(u'Unable to extract title')
         video_title = mobj.group(1).decode('utf-8')
-
         video_uploader = mobj.group(2).decode('utf-8')
 
         return [{
@@ -1803,10 +1812,7 @@ class DepositFilesIE(InfoExtractor):
         file_extension = os.path.splitext(file_url)[1][1:]
 
         # Search for file title
-        mobj = re.search(r'<b title="(.*?)">', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        file_title = mobj.group(1).decode('utf-8')
+        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
 
         return [{
             'id':       file_id.decode('utf-8'),
@@ -1900,10 +1906,9 @@ class FacebookIE(InfoExtractor):
         video_duration = int(video_data['video_duration'])
         thumbnail = video_data['thumbnail_src']
 
-        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find title in webpage')
-        video_title = unescapeHTML(m.group(1))
+        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
         info = {
             'id': video_id,
@@ -2065,15 +2070,10 @@ class MyVideoIE(InfoExtractor):
             self.report_extraction(video_id)
             video_url = mobj.group(1) + '.flv'
 
-            mobj = re.search('<title>([^<]+)</title>', webpage)
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract title')
-            video_title = mobj.group(1)
+            video_title = self._search_regex('<title>([^<]+)</title>',
+                webpage, u'title')
 
-            mobj = re.search('[.](.+?)$', video_url)
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract extention')
-            video_ext = mobj.group(1)
+            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
 
             return [{
                 'id':       video_id,
@@ -2121,25 +2121,23 @@ class MyVideoIE(InfoExtractor):
         # extracting infos
         self.report_extraction(video_id)
 
+        video_url = None
         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract rtmpurl')
-        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
-        if 'myvideo2flash' in video_rtmpurl:
-            self._downloader.report_warning(u'forcing RTMPT ...')
-            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
+        if mobj:
+            video_url = compat_urllib_parse.unquote(mobj.group(1))
+            if 'myvideo2flash' in video_url:
+                self._downloader.report_warning(u'forcing RTMPT ...')
+                video_url = video_url.replace('rtmpe://', 'rtmpt://')
 
-        # extract non rtmp videos
-        if (video_rtmpurl is None) or (video_rtmpurl == ''):
+        if not video_url:
+            # extract non rtmp videos
             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
             if mobj is None:
                 raise ExtractorError(u'unable to extract url')
-            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
 
-        mobj = re.search('source=\'(.*?)\'', dec_data)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract swfobj')
-        video_file     = compat_urllib_parse.unquote(mobj.group(1))
+        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+        video_file = compat_urllib_parse.unquote(video_file)
 
         if not video_file.endswith('f4m'):
             ppath, prefix = video_file.split('.')
@@ -2151,20 +2149,16 @@ class MyVideoIE(InfoExtractor):
                 video_filepath + video_file
             ).replace('.f4m', '.m3u8')
 
-        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract swfobj')
-        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
+        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+        video_swfobj = compat_urllib_parse.unquote(video_swfobj)
 
-        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+            webpage, u'title')
 
         return [{
             'id':                 video_id,
-            'url':                video_rtmpurl,
-            'tc_url':             video_rtmpurl,
+            'url':                video_url,
+            'tc_url':             video_url,
             'uploader':           None,
             'upload_date':        None,
             'title':              video_title,
@@ -2175,6 +2169,7 @@ class MyVideoIE(InfoExtractor):
             'player_url':         video_swfobj,
         }]
 
+
 class ComedyCentralIE(InfoExtractor):
     """Information extractor for The Daily Show and Colbert Report """
 
@@ -2357,16 +2352,22 @@ class EscapistIE(InfoExtractor):
         videoId = mobj.group('episode')
 
         self.report_extraction(showName)
-        webPage = self._download_webpage(url, showName)
+        webpage = self._download_webpage(url, showName)
 
-        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-        description = unescapeHTML(descMatch.group(1))
-        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-        imgUrl = unescapeHTML(imgMatch.group(1))
-        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-        playerUrl = unescapeHTML(playerUrlMatch.group(1))
-        configUrlMatch = re.search('config=(.*)$', playerUrl)
-        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
+        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+            webpage, u'description', fatal=False)
+        if videoDesc: videoDesc = unescapeHTML(videoDesc)
+
+        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+            webpage, u'thumbnail', fatal=False)
+        if imgUrl: imgUrl = unescapeHTML(imgUrl)
+
+        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+            webpage, u'player url')
+        playerUrl = unescapeHTML(playerUrl)
+
+        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+        configUrl = compat_urllib_parse.unquote(configUrl)
 
         configJSON = self._download_webpage(configUrl, showName,
                                             u'Downloading configuration',
@@ -2391,7 +2392,7 @@ class EscapistIE(InfoExtractor):
             'title': showName,
             'ext': 'mp4',
             'thumbnail': imgUrl,
-            'description': description,
+            'description': videoDesc,
             'player_url': playerUrl,
         }
 
@@ -2476,26 +2477,17 @@ class XVideosIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-
         # Extract video URL
-        mobj = re.search(r'flv_url=(.+?)&', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(mobj.group(1))
-
+        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
+            webpage, u'video URL'))
 
         # Extract title
-        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = mobj.group(1)
-
+        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+            webpage, u'title')
 
         # Extract video thumbnail
-        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video thumbnail')
-        video_thumbnail = mobj.group(0)
+        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
+            webpage, u'thumbnail', fatal=False)  # group 1 spans the full URL, not just the file name
 
         info = {
             'id': video_id,
@@ -2652,16 +2644,12 @@ class InfoQIE(InfoExtractor):
         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
 
         # Extract title
-        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'contentTitle = "(.*?)";',
+            webpage, u'title')
 
         # Extract description
-        video_description = u'No description available.'
-        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
-        if mobj is not None:
-            video_description = mobj.group(1)
+        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+            webpage, u'description', fatal=False)
 
         video_filename = video_url.split('/')[-1]
         video_id, extension = video_filename.split('.')
@@ -2832,15 +2820,16 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
+            # TODO: implement default_value in search_regex
             m = re.search('<h1>([^<]+)</h1>', coursepage)
             if m:
                 info['title'] = unescapeHTML(m.group(1))
             else:
                 info['title'] = info['id']
 
-            m = re.search('<description>([^<]+)</description>', coursepage)
-            if m:
-                info['description'] = unescapeHTML(m.group(1))
+            info['description'] = self._search_regex('<description>([^<]+)</description>',
+                coursepage, u'description', fatal=False)
+            if info['description']: info['description'] = unescapeHTML(info['description'])
 
             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
             info['list'] = [
@@ -2901,25 +2890,19 @@ class MTVIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract song name')
-        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
-        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract performer')
-        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
-        video_title = performer + ' - ' + song_name
+        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+            webpage, u'song name', fatal=False)
+        if song_name: song_name = unescapeHTML(song_name)
 
-        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to mtvn_uri')
-        mtvn_uri = mobj.group(1)
+        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
-        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract content id')
-        content_id = mobj.group(1)
+        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+            webpage, u'mtvn_uri')  # required: concatenated into videogen_url below
+
+        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
+            webpage, u'content id')  # required: concatenated into videogen_url below
 
         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
         self.report_extraction(video_id)
@@ -3067,20 +3050,15 @@ class XNXXIE(InfoExtractor):
         # Get webpage content
         webpage = self._download_webpage(url, video_id)
 
-        result = re.search(self.VIDEO_URL_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(result.group(1))
+        video_url = self._search_regex(self.VIDEO_URL_RE,
+            webpage, u'video URL')
+        video_url = compat_urllib_parse.unquote(video_url)
 
-        result = re.search(self.VIDEO_TITLE_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = result.group(1)
+        video_title = self._search_regex(self.VIDEO_TITLE_RE,
+            webpage, u'title')
 
-        result = re.search(self.VIDEO_THUMB_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video thumbnail')
-        video_thumbnail = result.group(1)
+        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id': video_id,
@@ -3100,26 +3078,6 @@ class GooglePlusIE(InfoExtractor):
     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
     IE_NAME = u'plus.google'
 
-    def report_extract_entry(self, url):
-        """Report downloading extry"""
-        self.to_screen(u'Downloading entry: %s' % url)
-
-    def report_date(self, upload_date):
-        """Report downloading extry"""
-        self.to_screen(u'Entry date: %s' % upload_date)
-
-    def report_uploader(self, uploader):
-        """Report downloading extry"""
-        self.to_screen(u'Uploader: %s' % uploader)
-
-    def report_title(self, video_title):
-        """Report downloading extry"""
-        self.to_screen(u'Title: %s' % video_title)
-
-    def report_extract_vid_page(self, video_page):
-        """Report information extraction."""
-        self.to_screen(u'Extracting video page: %s' % video_page)
-
     def _real_extract(self, url):
         # Extract id from URL
         mobj = re.match(self._VALID_URL, url)
@@ -3132,47 +3090,35 @@ class GooglePlusIE(InfoExtractor):
         video_extension = 'flv'
 
         # Step 1, Retrieve post webpage to extract further information
-        self.report_extract_entry(post_url)
         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
 
+        self.report_extraction(video_id)
+
         # Extract update date
-        upload_date = None
-        pattern = 'title="Timestamp">(.*?)</a>'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            upload_date = mobj.group(1)
+        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+            webpage, u'upload date', fatal=False)
+        if upload_date:
             # Convert timestring to a format suitable for filename
             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
             upload_date = upload_date.strftime('%Y%m%d')
-        self.report_date(upload_date)
 
         # Extract uploader
-        uploader = None
-        pattern = r'rel\="author".*?>(.*?)</a>'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            uploader = mobj.group(1)
-        self.report_uploader(uploader)
+        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+            webpage, u'uploader', fatal=False)
 
         # Extract title
         # Get the first line for title
+        # TODO: implement default_value in search_regex
         video_title = u'NA'
         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
         mobj = re.search(pattern, webpage)
         if mobj:
             video_title = mobj.group(1)
-        self.report_title(video_title)
 
         # Step 2, Stimulate clicking the image box to launch video
-        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
-        mobj = re.search(pattern, webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video page URL')
-
-        video_page = mobj.group(1)
+        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
+            webpage, u'video page URL')
         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
-        self.report_extract_vid_page(video_page)
-
 
         # Extract video links on video page
         """Extract video links of all sizes"""
@@ -3220,6 +3166,8 @@ class NBAIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
+
+        # TODO: implement default_value in search_regex
         def _findProp(rexp, default=None):
             m = re.search(rexp, webpage)
             if m:
@@ -3383,11 +3331,11 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
-        if not m:
-            raise ExtractorError(u'Unable to find video information')
-        video_url = unescapeHTML(m.group('url'))
+        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+            webpage, u'video URL', flags=re.DOTALL)
+        video_url = unescapeHTML(video_url)
 
+        # TODO: implement fallbacks in search_regex
         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
         if not m:
             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
@@ -3395,18 +3343,16 @@ class FunnyOrDieIE(InfoExtractor):
                 raise ExtractorError(u'Cannot find video title')
         title = clean_html(m.group('title'))
 
-        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
-        if m:
-            desc = unescapeHTML(m.group('desc'))
-        else:
-            desc = None
+        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+            webpage, u'description', fatal=False, flags=re.DOTALL)  # optional field
+        if video_description: video_description = unescapeHTML(video_description)
 
         info = {
             'id': video_id,
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'description': desc,
+            'description': video_description,
         }
         return [info]
 
@@ -3462,27 +3408,30 @@ class UstreamIE(InfoExtractor):
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
+
         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
         webpage = self._download_webpage(url, video_id)
+
         self.report_extraction(video_id)
-        try:
-            m = re.search(r'data-title="(?P<title>.+)"',webpage)
-            title = m.group('title')
-            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
-                          webpage, re.DOTALL)
-            uploader = unescapeHTML(m.group('uploader').strip())
-            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
-            thumb = m.group('thumb')
-        except AttributeError:
-            raise ExtractorError(u'Unable to extract info')
+
+        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+            webpage, u'title')
+
+        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+            webpage, u'uploader', fatal=False, flags=re.DOTALL)
+        if uploader: uploader = unescapeHTML(uploader.strip())
+
+        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+            webpage, u'thumbnail', fatal=False)
+
         info = {
-                'id':video_id,
-                'url':video_url,
+                'id': video_id,
+                'url': video_url,
                 'ext': 'flv',
-                'title': title,
+                'title': video_title,
                 'uploader': uploader,
-                'thumbnail': thumb,
-                  }
+                'thumbnail': thumbnail,
+               }
         return info
 
 class WorldStarHipHopIE(InfoExtractor):
@@ -3490,45 +3439,36 @@ class WorldStarHipHopIE(InfoExtractor):
     IE_NAME = u'WorldStarHipHop'
 
     def _real_extract(self, url):
-        _src_url = r'so\.addVariable\("file","(.*?)"\)'
-
         m = re.match(self._VALID_URL, url)
         video_id = m.group('id')
 
-        webpage_src = self._download_webpage(url, video_id) 
+        webpage_src = self._download_webpage(url, video_id)
 
-        mobj = re.search(_src_url, webpage_src)
+        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
+            webpage_src, u'video URL')
 
-        if mobj is not None:
-            video_url = mobj.group(1)
-            if 'mp4' in video_url:
-                ext = 'mp4'
-            else:
-                ext = 'flv'
+        if 'mp4' in video_url:
+            ext = 'mp4'
         else:
-            raise ExtractorError(u'Cannot find video url for %s' % video_id)
+            ext = 'flv'
 
-        mobj = re.search(r"<title>(.*)</title>", webpage_src)
+        video_title = self._search_regex(r"<title>(.*)</title>",
+            webpage_src, u'title')
 
-        if mobj is None:
-            raise ExtractorError(u'Cannot determine title')
-        title = mobj.group(1)
-
-        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        if mobj is not None:
-            thumbnail = mobj.group(1)
-        else:
+        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+            webpage_src, u'thumbnail', fatal=False)
+
+        if not thumbnail:
             _title = r"""candytitles.*>(.*)</span>"""
             mobj = re.search(_title, webpage_src)
             if mobj is not None:
-                title = mobj.group(1)
-            thumbnail = None
+                video_title = mobj.group(1)
 
         results = [{
                     'id': video_id,
                     'url' : video_url,
-                    'title' : title,
+                    'title' : video_title,
                     'thumbnail' : thumbnail,
                     'ext' : ext,
                     }]
@@ -3542,10 +3482,9 @@ class RBMARadioIE(InfoExtractor):
         video_id = m.group('videoID')
 
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find metadata')
-        json_data = m.group(1)
+
+        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
+            webpage, u'json data')
 
         try:
             data = json.loads(json_data)
@@ -3592,7 +3531,6 @@ class YouPornIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
             raise ExtractorError(u'Invalid URL: %s' % url)
-
         video_id = mobj.group('videoid')
 
         req = compat_urllib_request.Request(url)
@@ -3600,34 +3538,23 @@ class YouPornIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         # Get the video title
-        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = result.group('title').strip()
+        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
+            webpage, u'title').strip()
 
         # Get the video date
-        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
-        if result is None:
-            self._downloader.report_warning(u'unable to extract video date')
-            upload_date = None
-        else:
-            upload_date = unified_strdate(result.group('date').strip())
+        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
+            webpage, u'upload date', fatal=False)
+        if upload_date: upload_date = unified_strdate(upload_date.strip())
 
         # Get the video uploader
-        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
-        if result is None:
-            self._downloader.report_warning(u'unable to extract uploader')
-            video_uploader = None
-        else:
-            video_uploader = result.group('uploader').strip()
-            video_uploader = clean_html( video_uploader )
+        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
+            webpage, u'uploader', fatal=False)
+        if video_uploader: video_uploader = clean_html(video_uploader.strip())
 
         # Get all of the formats available
         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
-        result = re.search(DOWNLOAD_LIST_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract download list')
-        download_list_html = result.group('download_list').strip()
+        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
+            webpage, u'download list').strip()
 
         # Get all of the links from the page
         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
@@ -3704,17 +3631,13 @@ class PornotubeIE(InfoExtractor):
 
         # Get the video URL
         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
-        result = re.search(VIDEO_URL_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(result.group('url'))
+        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
+        video_url = compat_urllib_parse.unquote(video_url)
 
         #Get the uploaded date
         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        result = re.search(VIDEO_UPLOADED_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        upload_date = unified_strdate(result.group('date'))
+        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+        if upload_date: upload_date = unified_strdate(upload_date)
 
         info = {'id': video_id,
                 'url': video_url,
@@ -3741,10 +3664,8 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         # Get the video title
-        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
-        if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video title')
-        video_title = result.group('title').strip()
+        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+            webpage, u'title').strip()
 
         # Get the embed page
         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
@@ -3757,10 +3678,8 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(embed_page_url, video_id)
 
         # Get the video URL
-        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
-        if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video url')
-        video_url = result.group('source')
+        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+            webpage, u'video URL')
 
         info = {'id': video_id,
                 'url': video_url,
@@ -3783,10 +3702,7 @@ class EightTracksIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
 
-        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
-        if not m:
-            raise ExtractorError(u'Cannot find trax information')
-        json_like = m.group(1)
+        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
         data = json.loads(json_like)
 
         session = str(random.randint(0, 1000000000))
@@ -3822,18 +3738,24 @@ class KeekIE(InfoExtractor):
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
+
         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
-        title = unescapeHTML(m.group('title'))
-        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
-        uploader = clean_html(m.group('uploader'))
+
+        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
+
+        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+            webpage, u'uploader', fatal=False)
+        if uploader: uploader = clean_html(uploader)
+
         info = {
                 'id': video_id,
                 'url': video_url,
                 'ext': 'mp4',
-                'title': title,
+                'title': video_title,
                 'thumbnail': thumbnail,
                 'uploader': uploader
         }
@@ -3980,10 +3902,10 @@ class SpiegelIE(InfoExtractor):
         video_id = m.group('videoID')
 
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find title')
-        video_title = unescapeHTML(m.group(1))
+
+        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
         xml_code = self._download_webpage(xml_url, video_id,
@@ -4019,35 +3941,27 @@ class LiveLeakIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        m = re.search(r'file: "(.*?)",', webpage)
-        if not m:
-            raise ExtractorError(u'Unable to find video url')
-        video_url = m.group(1)
+        video_url = self._search_regex(r'file: "(.*?)",',
+            webpage, u'video URL')
 
-        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find video title')
-        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
 
-        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
-        if m:
-            desc = unescapeHTML(m.group('desc'))
-        else:
-            desc = None
+        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+            webpage, u'description', fatal=False)
+        if video_description: video_description = unescapeHTML(video_description)
 
-        m = re.search(r'By:.*?(\w+)</a>', webpage)
-        if m:
-            uploader = clean_html(m.group(1))
-        else:
-            uploader = None
+        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+            webpage, u'uploader', fatal=False)
 
         info = {
             'id':  video_id,
             'url': video_url,
             'ext': 'mp4',
-            'title': title,
-            'description': desc,
-            'uploader': uploader
+            'title': video_title,
+            'description': video_description,
+            'uploader': video_uploader
         }
 
         return [info]
@@ -4105,23 +4019,24 @@ class TumblrIE(InfoExtractor):
         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
         video = re.search(re_video, webpage)
         if video is None:
-            self.to_screen("No video found")
-            return []
+           raise ExtractorError(u'Unable to extract video')
         video_url = video.group('video_url')
         ext = video.group('ext')
 
-        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
-        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
+        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
+            webpage, u'thumbnail', fatal=False)  # We pick the first poster
+        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 
         # The only place where you can get a title, it's not complete,
         # but searching in other places doesn't work for all videos
-        re_title = r'<title>(?P<title>.*?)</title>'
-        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
+        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+            webpage, u'title', flags=re.DOTALL)
+        video_title = unescapeHTML(video_title)
 
         return [{'id': video_id,
                  'url': video_url,
-                 'title': title,
-                 'thumbnail': thumb,
+                 'title': video_title,
+                 'thumbnail': video_thumbnail,
                  'ext': ext
                  }]
 
@@ -4135,7 +4050,7 @@ class BandcampIE(InfoExtractor):
         # We get the link to the free download page
         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
         if m_download is None:
-            raise ExtractorError(u'No free songs founded')
+            raise ExtractorError(u'No free songs found')
 
         download_link = m_download.group(1)
         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
@@ -4163,10 +4078,10 @@ class BandcampIE(InfoExtractor):
 
         track_info = {'id':id,
                       'title' : info[u'title'],
-                      'ext' : 'mp3',
-                      'url' : final_url,
+                      'ext' :   'mp3',
+                      'url' :   final_url,
                       'thumbnail' : info[u'thumb_url'],
-                      'uploader' : info[u'artist']
+                      'uploader' :  info[u'artist']
                       }
 
         return [track_info]
@@ -4183,17 +4098,14 @@ class RedTubeIE(InfoExtractor):
         video_id = mobj.group('id')
         video_extension = 'mp4'        
         webpage = self._download_webpage(url, video_id)
+
         self.report_extraction(video_id)
-        mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
 
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
+        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+            webpage, u'video URL')
 
-        video_url = mobj.group(1)
-        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+            webpage, u'title')
 
         return [{
             'id':       video_id,
@@ -4214,15 +4126,13 @@ class InaIE(InfoExtractor):
         video_extension = 'mp4'
         webpage = self._download_webpage(mrss_url, video_id)
 
-        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        video_url = mobj.group(1)
+        self.report_extraction(video_id)
 
-        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+            webpage, u'video URL')
+
+        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
+            webpage, u'title')
 
         return [{
             'id':       video_id,
@@ -4244,27 +4154,17 @@ class HowcastIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-        video_url = mobj.group(1)
+        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
+            webpage, u'video URL')
 
-        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1) or mobj.group(2)
+        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
-        if mobj is None:
-            self._downloader.report_warning(u'unable to extract description')
-            video_description = None
-        else:
-            video_description = mobj.group(1) or mobj.group(2)
+        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+            webpage, u'description', fatal=False)
 
-        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':       video_id,
@@ -4280,7 +4180,6 @@ class VineIE(InfoExtractor):
     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 
     def _real_extract(self, url):
-
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
@@ -4289,25 +4188,17 @@ class VineIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-        video_url = mobj.group(1)
+        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+            webpage, u'video URL')
 
-        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+            webpage, u'thumbnail', fatal=False)
 
-        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract uploader')
-        uploader = mobj.group(1)
+        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+            webpage, u'uploader', fatal=False, flags=re.DOTALL)
 
         return [{
             'id':        video_id,
@@ -4330,18 +4221,13 @@ class FlickrIE(InfoExtractor):
         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
         webpage = self._download_webpage(webpage_url, video_id)
 
-        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video secret')
-        secret = mobj.group(1)
+        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 
         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 
-        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract node_id')
-        node_id = mobj.group(1)
+        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+            first_xml, u'node_id')
 
         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
@@ -4353,22 +4239,14 @@ class FlickrIE(InfoExtractor):
             raise ExtractorError(u'Unable to extract video url')
         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 
-        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1) or mobj.group(2)
+        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'video title')
 
-        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            self._downloader.report_warning(u'unable to extract description')
-            video_description = None
-        else:
-            video_description = mobj.group(1) or mobj.group(2)
+        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'description', fatal=False)
 
-        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1) or mobj.group(2)
+        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':          video_id,
@@ -4390,32 +4268,25 @@ class TeamcocoIE(InfoExtractor):
         url_title = mobj.group('url_title')
         webpage = self._download_webpage(url, url_title)
 
-        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
-        video_id = mobj.group(1)
+        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+            webpage, u'video id')
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+            webpage, u'thumbnail', fatal=False)
 
-        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract description')
-        description = mobj.group(1)
+        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+            webpage, u'description', fatal=False)
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
-        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = mobj.group(1)
+
+        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+            data, u'video URL')
 
         return [{
             'id':          video_id,
@@ -4423,7 +4294,7 @@ class TeamcocoIE(InfoExtractor):
             'ext':         'mp4',
             'title':       video_title,
             'thumbnail':   thumbnail,
-            'description': description,
+            'description': video_description,
         }]
         
 class XHamsterIE(InfoExtractor):

From 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Thu, 6 Jun 2013 14:35:08 +0200
Subject: [PATCH 02/12] implement fallbacks and defaults in _search_regex

---
 youtube_dl/InfoExtractors.py | 86 +++++++++++++++++++-----------------
 youtube_dl/utils.py          |  3 ++
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 4d13c17e44..fbf40f3ca9 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,19 +191,37 @@ class InfoExtractor(object):
             video_info['title'] = playlist_title
         return video_info
 
-    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
-        """Extract a field from some text based on regex"""
-        mobj = re.search(pattern, text, flags)
-        if mobj is None and fatal:
-            raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
-        elif mobj is None:
-            self._downloader.report_warning(u'unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
-            return None
+    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """Return the first non-None group of the first pattern that matches.
+        pattern is one regex (string or compiled) or an iterable of fallbacks.
+        On failure return default if it is not None; otherwise raise an
+        ExtractorError if fatal, else report a warning and return None.
+        """
+        mobj = None  # an empty pattern iterable must not leave mobj unbound
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            mobj = re.search(pattern, string, flags)
         else:
-            # return the first matched group
+            for p in pattern:
+                mobj = re.search(p, string, flags)
+                if mobj: break
+
+        if sys.stderr.isatty() and os.name != 'nt':  # colour only on non-Windows ttys
+            _name = u'\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if mobj:
+            # return the first matching group
             return next(g for g in mobj.groups() if g is not None)
+        elif default is not None:
+            return default
+        elif fatal:
+            raise ExtractorError(u'Unable to extract %s; '
+                u'please report this issue on GitHub.' % _name)
+        else:
+            self._downloader.report_warning(u'unable to extract %s; '
+                u'please report this issue on GitHub.' % _name)
+            return None
 
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
-            # TODO: implement default_value in search_regex
-            m = re.search('<h1>([^<]+)</h1>', coursepage)
-            if m:
-                info['title'] = unescapeHTML(m.group(1))
-            else:
-                info['title'] = info['id']
+            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+            info['title'] = unescapeHTML(info['title'])
 
             info['description'] = self._search_regex('<description>([^<]+)</description>',
                 coursepage, u'description', fatal=False)
@@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor):
 
         # Extract title
         # Get the first line for title
-        # TODO: implement default_value in search_regex
-        video_title = u'NA'
-        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+            webpage, 'title', default=u'NA')
 
         # Step 2, Stimulate clicking the image box to launch video
         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
@@ -3167,23 +3177,21 @@ class NBAIE(InfoExtractor):
 
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 
-        # TODO: implement default_value in search_regex
-        def _findProp(rexp, default=None):
-            m = re.search(rexp, webpage)
-            if m:
-                return unescapeHTML(m.group(1))
-            else:
-                return default
-
         shortened_video_id = video_id.rpartition('/')[2]
-        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+
+        uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+
+        description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+
         info = {
             'id': shortened_video_id,
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
-            'description': _findProp(r'<div class="description">(.*?)</h1>'),
+            'uploader_date': uploader_date,
+            'description': description,
         }
         return [info]
 
@@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor):
             webpage, u'video URL', flags=re.DOTALL)
         video_url = unescapeHTML(video_url)
 
-        # TODO: implement fallbacks in regex_search
-        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
-        if not m:
-            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
-            if not m:
-                raise ExtractorError(u'Cannot find video title')
-        title = clean_html(m.group('title'))
+        title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
+        title = clean_html(title)
 
         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', flags=re.DOTALL)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 63d9d0ae58..3a8dcf4d3c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -154,6 +154,9 @@ def compat_ord(c):
     if type(c) is int: return c
     else: return ord(c)
 
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
 std_headers = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',

From 476203d025dd2619ea9f9e2f99ffce507dec6596 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Thu, 6 Jun 2013 15:07:05 +0200
Subject: [PATCH 03/12] print WARNINGs during test + minor fix to NBAIE

---
 test/test_download.py        | 9 +++++++++
 youtube_dl/InfoExtractors.py | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/test_download.py b/test/test_download.py
index 3eca333f26..3e6bdd44e3 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -40,9 +40,18 @@ def _try_rm(filename):
 
 class FileDownloader(youtube_dl.FileDownloader):
     def __init__(self, *args, **kwargs):
+        self._to_stderr = self.to_stderr
         self.to_stderr = self.to_screen
         self.processed_info_dicts = []
         return youtube_dl.FileDownloader.__init__(self, *args, **kwargs)
+    def report_warning(self, message):
+        # let warnings pass to output
+        if sys.stderr.isatty() and os.name != 'nt':
+            _msg_header=u'\033[0;33mWARNING:\033[0m'
+        else:
+            _msg_header=u'WARNING:'
+        warning_message=u'%s %s' % (_msg_header,message)
+        self._to_stderr(warning_message)
     def process_info(self, info_dict):
         self.processed_info_dicts.append(info_dict)
         return youtube_dl.FileDownloader.process_info(self, info_dict)
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index fbf40f3ca9..0f1880756f 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3183,7 +3183,7 @@ class NBAIE(InfoExtractor):
 
         uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
-        description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
         info = {
             'id': shortened_video_id,

From be95cac157a75da1a0fa512b36eb90bc2c28cc96 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 11:19:27 +0200
Subject: [PATCH 04/12] raise exceptions on warnings during tests - and solve a
 couple of them

---
 test/test_download.py        | 10 ++-------
 youtube_dl/InfoExtractors.py | 41 ++++++++++++++++++------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/test/test_download.py b/test/test_download.py
index 3e6bdd44e3..565b1ebc55 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -40,18 +40,12 @@ def _try_rm(filename):
 
 class FileDownloader(youtube_dl.FileDownloader):
     def __init__(self, *args, **kwargs):
-        self._to_stderr = self.to_stderr
         self.to_stderr = self.to_screen
         self.processed_info_dicts = []
         return youtube_dl.FileDownloader.__init__(self, *args, **kwargs)
     def report_warning(self, message):
-        # let warnings pass to output
-        if sys.stderr.isatty() and os.name != 'nt':
-            _msg_header=u'\033[0;33mWARNING:\033[0m'
-        else:
-            _msg_header=u'WARNING:'
-        warning_message=u'%s %s' % (_msg_header,message)
-        self._to_stderr(warning_message)
+        # Don't accept warnings during tests
+        raise ExtractorError(message)
     def process_info(self, info_dict):
         self.processed_info_dicts.append(info_dict)
         return youtube_dl.FileDownloader.process_info(self, info_dict)
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 0f1880756f..bd6fce3b6f 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor):
         }]
 
 class NBAIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
     IE_NAME = u'nba'
 
     def _real_extract(self, url):
@@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor):
             raise ExtractorError(u'Invalid URL: %s' % url)
 
         video_id = mobj.group(1)
-        if video_id.endswith('/index.html'):
-            video_id = video_id[:-len('/index.html')]
 
         webpage = self._download_webpage(url, video_id)
 
@@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor):
         title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 
-        uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+        # It isn't there in the HTML it returns to us
+        # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
         description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
@@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor):
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'uploader_date': uploader_date,
+            # 'uploader_date': uploader_date,
             'description': description,
         }
         return [info]
@@ -3541,19 +3540,22 @@ class YouPornIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        # Get the video title
-        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
-            webpage, u'title').strip()
+        # Get JSON parameters
+        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+        try:
+            params = json.loads(json_params)
+        except ValueError:  # what json.loads raises on malformed input; let other errors propagate
+            raise ExtractorError(u'Invalid JSON')
 
-        # Get the video date
-        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
-            webpage, u'upload date', fatal=False)
-        if upload_date: upload_date = unified_strdate(upload_date.strip())
-
-        # Get the video uploader
-        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
-            webpage, u'uploader', fatal=False)
-        if video_uploader: video_uploader = clean_html(video_uploader.strip())
+        self.report_extraction(video_id)
+        try:
+            video_title = params['title']
+            upload_date = unified_strdate(params['release_date_f'])
+            video_description = params['description']
+            video_uploader = params['submitted_by']
+            thumbnail = params['thumbnails'][0]['image']
+        except KeyError:  # a required field is absent from the parsed JSON
+            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])
 
         # Get all of the formats available
         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
@@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor):
                 'title': title,
                 'ext': extension,
                 'format': format,
-                'thumbnail': None,
-                'description': None,
-                'player_url': None
+                'thumbnail': thumbnail,
+                'description': video_description
             })
 
         if self._downloader.params.get('listformats', None):

From 8409501206e37d57f01e5fe72bfc54a5562e4e0a Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 11:46:03 +0200
Subject: [PATCH 05/12] use search_regex in new IEs

---
 youtube_dl/InfoExtractors.py | 50 ++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index bd6fce3b6f..5d54e93e78 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3347,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor):
         title = clean_html(title)
 
         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', flags=re.DOTALL)
+            webpage, u'description', fatal=False, flags=re.DOTALL)
         if video_description: video_description = unescapeHTML(video_description)
 
         info = {
@@ -4301,7 +4301,7 @@ class TeamcocoIE(InfoExtractor):
             'thumbnail':   thumbnail,
             'description': video_description,
         }]
-        
+
 class XHamsterIE(InfoExtractor):
     """Information Extractor for xHamster"""
     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
@@ -4310,8 +4310,9 @@ class XHamsterIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
-        mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
         webpage = self._download_webpage(mrss_url, video_id)
+
         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
         if mobj is None:
             raise ExtractorError(u'Unable to extract media URL')
@@ -4321,32 +4322,26 @@ class XHamsterIE(InfoExtractor):
             video_url = mobj.group('server')+'/key='+mobj.group('file')
         video_extension = video_url.split('.')[-1]
 
-        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
+        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
-        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
-        if mobj is None:
-            video_description = u''
-        else:
-            video_description = unescapeHTML(mobj.group('description'))
+        video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+            webpage, u'description', fatal=False)
+        if video_description: video_description = unescapeHTML(video_description)
 
         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract upload date')
-        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-
-        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
-        if mobj is None:
-            video_uploader_id = u'anonymous'
+        if mobj:
+            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
         else:
-            video_uploader_id = mobj.group('uploader_id')
+            video_upload_date = None
+            self._downloader.report_warning(u'Unable to extract upload date')
 
-        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail URL')
-        video_thumbnail = mobj.group('thumbnail')
+        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+            webpage, u'uploader id', default=u'anonymous')
+
+        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':       video_id,
@@ -4377,10 +4372,9 @@ class HypemIE(InfoExtractor):
         cookie = urlh.headers.get('Set-Cookie', '')
 
         self.report_extraction(track_id)
-        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extrack tracks')
-        html_tracks = mobj.group(1).strip()
+
+        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
         try:
             track_list = json.loads(html_tracks)
             track = track_list[u'tracks'][0]

From 8b59a9861040482c9af58e85fb397353ea2e8080 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 12:10:02 +0200
Subject: [PATCH 06/12] XHamster: Can't see the description anywhere in the UI

---
 youtube_dl/InfoExtractors.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 5d54e93e78..0d7db013bb 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -4326,9 +4326,10 @@ class XHamsterIE(InfoExtractor):
             webpage, u'title')
         video_title = unescapeHTML(video_title)
 
-        video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
-            webpage, u'description', fatal=False)
-        if video_description: video_description = unescapeHTML(video_description)
+        # Can't see the description anywhere in the UI
+        # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+        #     webpage, u'description', fatal=False)
+        # if video_description: video_description = unescapeHTML(video_description)
 
         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
         if mobj:
@@ -4348,7 +4349,7 @@ class XHamsterIE(InfoExtractor):
             'url':      video_url,
             'ext':      video_extension,
             'title':    video_title,
-            'description': video_description,
+            # 'description': video_description,
             'upload_date': video_upload_date,
             'uploader_id': video_uploader_id,
             'thumbnail': video_thumbnail

From f5a290eed949b7726a8d745960bbe9c6b8b7de52 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sat, 8 Jun 2013 09:56:34 +0200
Subject: [PATCH 07/12] print "please report this issue on GitHub" on every
 ExtractorError

---
 youtube_dl/InfoExtractors.py | 3 +--
 youtube_dl/utils.py          | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 0d7db013bb..86cc7c7484 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -216,8 +216,7 @@ class InfoExtractor(object):
         elif default is not None:
             return default
         elif fatal:
-            raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % _name)
+            raise ExtractorError(u'Unable to extract %s' % _name)
         else:
             self._downloader.report_warning(u'unable to extract %s; '
                 u'please report this issue on GitHub.' % _name)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 3a8dcf4d3c..718ee3aae0 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -472,6 +472,7 @@ class ExtractorError(Exception):
     """Error during info extraction."""
     def __init__(self, msg, tb=None):
         """ tb, if given, is the original traceback (so that it can be printed out). """
+        msg = msg + u'; please report this issue on GitHub.'
         super(ExtractorError, self).__init__(msg)
         self.traceback = tb
         self.exc_info = sys.exc_info()  # preserve original exception

From d5979c5d55b0df11973b9a2b6630fd676e5726d1 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 11:55:08 +0200
Subject: [PATCH 08/12] do not ask the user to report network errors

---
 youtube_dl/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 718ee3aae0..66ae41e319 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -12,7 +12,7 @@ import sys
 import traceback
 import zlib
 import email.utils
-import json
+import socket
 import datetime
 
 try:
@@ -472,8 +472,11 @@ class ExtractorError(Exception):
     """Error during info extraction."""
     def __init__(self, msg, tb=None):
         """ tb, if given, is the original traceback (so that it can be printed out). """
-        msg = msg + u'; please report this issue on GitHub.'
+
+        if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+            msg = msg + u'; please report this issue on GitHub.'
         super(ExtractorError, self).__init__(msg)
+
         self.traceback = tb
         self.exc_info = sys.exc_info()  # preserve original exception
 

From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 11:57:13 +0200
Subject: [PATCH 09/12] _html_search_regex with clean_html superpowers

---
 test/tests.json              |   2 +-
 youtube_dl/InfoExtractors.py | 151 ++++++++++++++++-------------------
 2 files changed, 72 insertions(+), 81 deletions(-)

diff --git a/test/tests.json b/test/tests.json
index c39d1d9c1c..82da27d5b5 100644
--- a/test/tests.json
+++ b/test/tests.json
@@ -325,7 +325,7 @@
     "file": "wshh6a7q1ny0G34ZwuIO.mp4",
     "md5": "9d04de741161603bf7071bbf4e883186",
     "info_dict": {
-        "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! "
+        "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
     }
   },
   {
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 86cc7c7484..6060a5988c 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -222,6 +222,16 @@ class InfoExtractor(object):
                 u'please report this issue on GitHub.' % _name)
             return None
 
+    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Like _search_regex, but strips HTML tags and unescapes entities.
+        """
+        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        if res:
+            return clean_html(res).strip()
+        else:
+            return res
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
@@ -1923,9 +1933,8 @@ class FacebookIE(InfoExtractor):
         video_duration = int(video_data['video_duration'])
         thumbnail = video_data['thumbnail_src']
 
-        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         info = {
             'id': video_id,
@@ -2087,7 +2096,7 @@ class MyVideoIE(InfoExtractor):
             self.report_extraction(video_id)
             video_url = mobj.group(1) + '.flv'
 
-            video_title = self._search_regex('<title>([^<]+)</title>',
+            video_title = self._html_search_regex('<title>([^<]+)</title>',
                 webpage, u'title')
 
             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
@@ -2169,7 +2178,7 @@ class MyVideoIE(InfoExtractor):
         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
 
-        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
             webpage, u'title')
 
         return [{
@@ -2371,17 +2380,14 @@ class EscapistIE(InfoExtractor):
         self.report_extraction(showName)
         webpage = self._download_webpage(url, showName)
 
-        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
             webpage, u'description', fatal=False)
-        if videoDesc: videoDesc = unescapeHTML(videoDesc)
 
-        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
             webpage, u'thumbnail', fatal=False)
-        if imgUrl: imgUrl = unescapeHTML(imgUrl)
 
-        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
             webpage, u'player url')
-        playerUrl = unescapeHTML(playerUrl)
 
         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
         configUrl = compat_urllib_parse.unquote(configUrl)
@@ -2499,7 +2505,7 @@ class XVideosIE(InfoExtractor):
             webpage, u'video URL'))
 
         # Extract title
-        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
             webpage, u'title')
 
         # Extract video thumbnail
@@ -2665,7 +2671,7 @@ class InfoQIE(InfoExtractor):
             webpage, u'title')
 
         # Extract description
-        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
             webpage, u'description', fatal=False)
 
         video_filename = video_url.split('/')[-1]
@@ -2837,12 +2843,10 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
-            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
-            info['title'] = unescapeHTML(info['title'])
+            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 
-            info['description'] = self._search_regex('<description>([^<]+)</description>',
+            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                 coursepage, u'description', fatal=False)
-            if info['description']: info['description'] = unescapeHTML(info['description'])
 
             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
             info['list'] = [
@@ -2903,15 +2907,13 @@ class MTVIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
             webpage, u'song name', fatal=False)
-        if song_name: song_name = unescapeHTML(song_name)
 
-        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
-        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
             webpage, u'mtvn_uri', fatal=False)
 
         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
@@ -3067,7 +3069,7 @@ class XNXXIE(InfoExtractor):
             webpage, u'video URL')
         video_url = compat_urllib_parse.unquote(video_url)
 
-        video_title = self._search_regex(self.VIDEO_TITLE_RE,
+        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
             webpage, u'title')
 
         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
@@ -3108,7 +3110,7 @@ class GooglePlusIE(InfoExtractor):
         self.report_extraction(video_id)
 
         # Extract update date
-        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
             webpage, u'upload date', fatal=False)
         if upload_date:
             # Convert timestring to a format suitable for filename
@@ -3116,12 +3118,12 @@ class GooglePlusIE(InfoExtractor):
             upload_date = upload_date.strftime('%Y%m%d')
 
         # Extract uploader
-        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
             webpage, u'uploader', fatal=False)
 
         # Extract title
         # Get the first line for title
-        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
             webpage, 'title', default=u'NA')
 
         # Step 2, Stimulate clicking the image box to launch video
@@ -3175,13 +3177,13 @@ class NBAIE(InfoExtractor):
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 
         shortened_video_id = video_id.rpartition('/')[2]
-        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 
         # It isn't there in the HTML it returns to us
-        # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
-        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
         info = {
             'id': shortened_video_id,
@@ -3337,17 +3339,14 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
             webpage, u'video URL', flags=re.DOTALL)
-        video_url = unescapeHTML(video_url)
 
-        title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
-        title = clean_html(title)
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', fatal=False, flags=re.DOTALL)
-        if video_description: video_description = unescapeHTML(video_description)
 
         info = {
             'id': video_id,
@@ -3416,14 +3415,13 @@ class UstreamIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
             webpage, u'title')
 
-        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
             webpage, u'uploader', fatal=False, flags=re.DOTALL)
-        if uploader: uploader = unescapeHTML(uploader.strip())
 
-        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
             webpage, u'thumbnail', fatal=False)
 
         info = {
@@ -3454,11 +3452,11 @@ class WorldStarHipHopIE(InfoExtractor):
         else:
             ext = 'flv'
 
-        video_title = self._search_regex(r"<title>(.*)</title>",
+        video_title = self._html_search_regex(r"<title>(.*)</title>",
             webpage_src, u'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
             webpage_src, u'thumbnail', fatal=False)
 
         if not thumbnail:
@@ -3640,7 +3638,7 @@ class PornotubeIE(InfoExtractor):
 
         #Get the uploaded date
         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
         if upload_date: upload_date = unified_strdate(upload_date)
 
         info = {'id': video_id,
@@ -3668,7 +3666,7 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         # Get the video title
-        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
             webpage, u'title').strip()
 
         # Get the embed page
@@ -3747,13 +3745,11 @@ class KeekIE(InfoExtractor):
         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
-        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
             webpage, u'uploader', fatal=False)
-        if uploader: uploader = clean_html(uploader)
 
         info = {
                 'id': video_id,
@@ -3907,9 +3903,8 @@ class SpiegelIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
         xml_code = self._download_webpage(xml_url, video_id,
@@ -3948,15 +3943,13 @@ class LiveLeakIE(InfoExtractor):
         video_url = self._search_regex(r'file: "(.*?)",',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
-        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title').replace('LiveLeak.com -', '').strip()
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', fatal=False)
-        if video_description: video_description = unescapeHTML(video_description)
 
-        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
             webpage, u'uploader', fatal=False)
 
         info = {
@@ -4033,9 +4026,8 @@ class TumblrIE(InfoExtractor):
 
         # The only place where you can get a title, it's not complete,
         # but searching in other places doesn't work for all videos
-        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
             webpage, u'title', flags=re.DOTALL)
-        video_title = unescapeHTML(video_title)
 
         return [{'id': video_id,
                  'url': video_url,
@@ -4105,10 +4097,10 @@ class RedTubeIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
             webpage, u'video URL')
 
-        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
             webpage, u'title')
 
         return [{
@@ -4132,7 +4124,7 @@ class InaIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
             webpage, u'video URL')
 
         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
@@ -4161,13 +4153,13 @@ class HowcastIE(InfoExtractor):
         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
             webpage, u'title')
 
-        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
             webpage, u'description', fatal=False)
 
-        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
             webpage, u'thumbnail', fatal=False)
 
         return [{
@@ -4192,16 +4184,16 @@ class VineIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
             webpage, u'title')
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
             webpage, u'thumbnail', fatal=False)
 
-        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 
         return [{
@@ -4230,7 +4222,7 @@ class FlickrIE(InfoExtractor):
         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 
-        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
             first_xml, u'node_id')
 
         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
@@ -4243,13 +4235,13 @@ class FlickrIE(InfoExtractor):
             raise ExtractorError(u'Unable to extract video url')
         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 
-        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'video title')
 
-        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'description', fatal=False)
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'thumbnail', fatal=False)
 
         return [{
@@ -4272,24 +4264,24 @@ class TeamcocoIE(InfoExtractor):
         url_title = mobj.group('url_title')
         webpage = self._download_webpage(url, url_title)
 
-        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
             webpage, u'video id')
 
         self.report_extraction(video_id)
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
             webpage, u'title')
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
             webpage, u'thumbnail', fatal=False)
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
             webpage, u'description', fatal=False)
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 
-        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
             data, u'video URL')
 
         return [{
@@ -4321,12 +4313,11 @@ class XHamsterIE(InfoExtractor):
             video_url = mobj.group('server')+'/key='+mobj.group('file')
         video_extension = video_url.split('.')[-1]
 
-        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         # Can't see the description anywhere in the UI
-        # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
         #     webpage, u'description', fatal=False)
         # if video_description: video_description = unescapeHTML(video_description)
 
@@ -4337,7 +4328,7 @@ class XHamsterIE(InfoExtractor):
             video_upload_date = None
             self._downloader.report_warning(u'Unable to extract upload date')
 
-        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
             webpage, u'uploader id', default=u'anonymous')
 
         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
@@ -4373,7 +4364,7 @@ class HypemIE(InfoExtractor):
 
         self.report_extraction(track_id)
 
-        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
         try:
             track_list = json.loads(html_tracks)

From 78d3442b1209d3858cfea1f7ca958f661784b5ab Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 14:21:42 +0200
Subject: [PATCH 10/12] test: extend the reach of info_dict checking

* print the info_dict in a format suitable for easy addition to tests.json during tests if untested fields are detected
* make it possible to put the crc32 in tests.json if the field is too long
* complete the "info_dict" fields in existing tests
* fixed the bugs caught while doing this
---
 test/test_download.py        |  21 +++-
 test/tests.json              | 185 ++++++++++++++++++++++++++++-------
 youtube_dl/InfoExtractors.py |  17 ++--
 3 files changed, 177 insertions(+), 46 deletions(-)

diff --git a/test/test_download.py b/test/test_download.py
index 565b1ebc55..8621520336 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -7,8 +7,8 @@ import os
 import json
 import unittest
 import sys
-import hashlib
 import socket
+import binascii
 
 # Allow direct execution
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -38,6 +38,9 @@ def _try_rm(filename):
         if ose.errno != errno.ENOENT:
             raise
 
+def crc32(value):
+    return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff)
+
 class FileDownloader(youtube_dl.FileDownloader):
     def __init__(self, *args, **kwargs):
         self.to_stderr = self.to_screen
@@ -124,7 +127,21 @@ def generator(test_case):
                 with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
                     info_dict = json.load(infof)
                 for (info_field, value) in tc.get('info_dict', {}).items():
-                    self.assertEqual(value, info_dict.get(info_field))
+                    if isinstance(value, compat_str) and value.startswith('crc32:'):
+                        self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field)))
+                    else:
+                        self.assertEqual(value, info_dict.get(info_field))
+
+                # If checkable fields are missing from the test case, print the info_dict
+                test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value))
+                    for key, value in info_dict.items()
+                    if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
+                if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
+                    sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n')
+
+                # Check for the presence of mandatory fields
+                for key in ('id', 'url', 'title', 'ext'):
+                    self.assertTrue(key in info_dict.keys() and info_dict[key])
         finally:
             for tc in test_cases:
                 _try_rm(tc['file'])
diff --git a/test/tests.json b/test/tests.json
index 82da27d5b5..e9abb0950f 100644
--- a/test/tests.json
+++ b/test/tests.json
@@ -15,43 +15,76 @@
     "name": "Dailymotion",
     "md5":  "392c4b85a60a90dc4792da41ce3144eb",
     "url":  "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech",
-    "file":  "x33vw9.mp4"
+    "file":  "x33vw9.mp4",
+    "info_dict": {
+      "uploader": "Alex and Van .",
+      "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
+    }
   },
   {
     "name": "Metacafe",
     "add_ie": ["Youtube"],
     "url":  "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
-    "file":  "_aUehQsCQtM.flv"
+    "file":  "_aUehQsCQtM.flv",
+    "info_dict": {
+      "upload_date": "20090102",
+      "title": "The Electric Company | \"Short I\" | PBS KIDS GO!",
+      "description": "crc32:5ef3bc57",
+      "uploader": "PBS",
+      "uploader_id": "PBS"
+    }
   },
   {
     "name": "BlipTV",
     "md5":  "b2d849efcf7ee18917e4b4d9ff37cafe",
     "url":  "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352",
-    "file":  "5779306.m4v"
+    "file":  "5779306.m4v",
+    "info_dict": {
+      "upload_date": "20111205",
+      "description": "crc32:fa658d49",
+      "uploader": "Comic Book Resources - CBR TV",
+      "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
+    }
   },
   {
     "name": "XVideos",
     "md5":  "1d0c835822f0a71a7bf011855db929d0",
     "url":  "http://www.xvideos.com/video939581/funny_porns_by_s_-1",
-    "file":  "939581.flv"
+    "file":  "939581.flv",
+    "info_dict": {
+      "title": "Funny Porns By >>>>S<<<<<< -1"
+    }
   },
   {
     "name": "YouPorn",
     "md5": "c37ddbaaa39058c76a7e86c6813423c1",
     "url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/",
-    "file": "505835.mp4"
+    "file": "505835.mp4",
+    "info_dict": {
+      "upload_date": "20101221",
+      "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
+      "uploader": "Ask Dan And Jennifer",
+      "title": "Sex Ed: Is It Safe To Masturbate Daily?"
+    }
   },
   {
     "name": "Pornotube",
     "md5": "374dd6dcedd24234453b295209aa69b6",
     "url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing",
-    "file": "1689755.flv"
+    "file": "1689755.flv",
+    "info_dict": {
+      "upload_date": "20090708",
+      "title": "Marilyn-Monroe-Bathing"
+    }
   },
   {
     "name": "YouJizz",
     "md5": "07e15fa469ba384c7693fd246905547c",
     "url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html",
-    "file": "2189178.flv"
+    "file": "2189178.flv",
+    "info_dict": {
+      "title": "Zeichentrick 1"
+    }
   },
   {
     "name": "Vimeo",
@@ -70,61 +103,103 @@
     "name": "Soundcloud",
     "md5":  "ebef0a451b909710ed1d7787dddbf0d7",
     "url":  "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy",
-    "file":  "62986583.mp3"
+    "file":  "62986583.mp3",
+    "info_dict": {
+      "upload_date": "20121011",
+      "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
+      "uploader": "E.T. ExTerrestrial Music",
+      "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+    }
   },
   {
     "name": "StanfordOpenClassroom",
     "md5":  "544a9468546059d4e80d76265b0443b8",
     "url":  "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100",
-    "file":  "PracticalUnix_intro-environment.mp4"
+    "file":  "PracticalUnix_intro-environment.mp4",
+    "info_dict": {
+      "title": "Intro Environment"
+    }
   },
   {
     "name": "XNXX",
     "md5":  "0831677e2b4761795f68d417e0b7b445",
     "url":  "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_",
-    "file":  "1135332.flv"
+    "file":  "1135332.flv",
+    "info_dict": {
+      "title": "lida » Naked Funny Actress  (5)"
+    }
   },
   {
     "name": "Youku",
     "url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
     "file": "XNDgyMDQ2NTQw_part00.flv",
     "md5": "ffe3f2e435663dc2d1eea34faeff5b5b",
-    "params": { "test": false }
+    "params": { "test": false },
+    "info_dict": {
+      "title": "youtube-dl test video \"'/\\ä↭𝕐"
+    }
   },
   {
     "name": "NBA",
     "url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html",
     "file": "0021200253-okc-bkn-recap.nba.mp4",
-    "md5": "c0edcfc37607344e2ff8f13c378c88a4"
+    "md5": "c0edcfc37607344e2ff8f13c378c88a4",
+    "info_dict": {
+      "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.",
+      "title": "Thunder vs. Nets"
+    }
   },
   {
     "name": "JustinTV",
     "url": "http://www.twitch.tv/thegamedevhub/b/296128360",
     "file": "296128360.flv",
-    "md5": "ecaa8a790c22a40770901460af191c9a"
+    "md5": "ecaa8a790c22a40770901460af191c9a",
+    "info_dict": {
+      "upload_date": "20110927",
+      "uploader_id": 25114803,
+      "uploader": "thegamedevhub",
+      "title": "Beginner Series - Scripting With Python Pt.1"
+    }
   },
   {
     "name": "MyVideo",
     "url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win",
     "file": "8229274.flv",
-    "md5": "2d2753e8130479ba2cb7e0a37002053e"
+    "md5": "2d2753e8130479ba2cb7e0a37002053e",
+    "info_dict": {
+      "title": "bowling-fail-or-win"
+    }
   },
   {
     "name": "Escapist",
     "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate",
     "file": "6618-Breaking-Down-Baldurs-Gate.mp4",
-    "md5": "c6793dbda81388f4264c1ba18684a74d"
+    "md5": "c6793dbda81388f4264c1ba18684a74d",
+    "info_dict": {
+      "description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
+      "uploader": "the-escapist-presents",
+      "title": "Breaking Down Baldur's Gate"
+    }
   },
   {
     "name": "GooglePlus",
     "url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH",
-    "file": "ZButuJc6CtH.flv"
+    "file": "ZButuJc6CtH.flv",
+    "info_dict": {
+      "upload_date": "20120613",
+      "uploader": "井上ヨシマサ",
+      "title": "嘆きの天使 降臨"
+    }
   },
   {
     "name": "FunnyOrDie",
     "url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version",
     "file": "0732f586d7.mp4",
-    "md5": "f647e9e90064b53b6e046e75d0241fbd"
+    "md5": "f647e9e90064b53b6e046e75d0241fbd",
+    "info_dict": {
+      "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.",
+      "title": "Heart-Shaped Box: Literal Video Version"
+    }
   },
   {
     "name": "Steam",
@@ -161,6 +236,7 @@
     "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
     "file": "12-jan-pythonthings.mp4",
     "info_dict": {
+      "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
       "title": "A Few of My Favorite [Python] Things"
     },
     "params": {
@@ -173,7 +249,10 @@
     "file": "422212.mp4",
     "md5": "4e2f5cb088a83cd8cdb7756132f9739d",
     "info_dict": {
-        "title": "thedailyshow-kristen-stewart part 1"
+      "upload_date": "20121214",
+      "description": "Kristen Stewart",
+      "uploader": "thedailyshow",
+      "title": "thedailyshow-kristen-stewart part 1"
     }
   },
   {
@@ -224,42 +303,48 @@
         "file": "11885679.m4a",
         "md5": "d30b5b5f74217410f4689605c35d1fd7",
         "info_dict": {
-          "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad"
+          "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       },
       {
         "file": "11885680.m4a",
         "md5": "4eb0a669317cd725f6bbd336a29f923a",
         "info_dict": {
-          "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad"
+          "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       },
       {
         "file": "11885682.m4a",
         "md5": "1893e872e263a2705558d1d319ad19e8",
         "info_dict": {
-          "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad"
+          "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       },
       {
         "file": "11885683.m4a",
         "md5": "b673c46f47a216ab1741ae8836af5899",
         "info_dict": {
-          "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad"
+          "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       },
       {
         "file": "11885684.m4a",
         "md5": "1d74534e95df54986da7f5abf7d842b7",
         "info_dict": {
-          "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad"
+          "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       },
       {
         "file": "11885685.m4a",
         "md5": "f081f47af8f6ae782ed131d38b9cd1c0",
         "info_dict": {
-          "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad"
+          "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
+          "uploader_id": "ytdl"
         }
       }
     ]
@@ -270,9 +355,9 @@
     "file": "NODfbab.mp4",
     "md5": "9b0636f8c0f7614afa4ea5e4c6e57e83",
     "info_dict": {
+      "uploader": "ytdl",
       "title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ."
     }
-
   },
   {
     "name": "TED",
@@ -290,14 +375,19 @@
     "file": "11741.mp4",
     "md5": "0b49f4844a068f8b33f4b7c88405862b",
     "info_dict": {
-        "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
+      "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
+      "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
     }
   },
   {
     "name": "Generic",
     "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html",
     "file": "13601338388002.mp4",
-    "md5": "85b90ccc9d73b4acd9138d3af4c27f89"
+    "md5": "85b90ccc9d73b4acd9138d3af4c27f89",
+    "info_dict": {
+      "uploader": "www.hodiho.fr",
+      "title": "Régis plante sa Jeep"
+    }
   },
   {
     "name": "Spiegel",
@@ -355,42 +445,59 @@
         "file":"30510138.mp3",
         "md5":"f9136bf103901728f29e419d2c70f55d",
         "info_dict": {
-          "title":"D-D-Dance"
+          "upload_date": "20111213",
+          "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+          "uploader": "The Royal Concept",
+          "title": "D-D-Dance"
         }
       },
       {
         "file":"47127625.mp3",
         "md5":"09b6758a018470570f8fd423c9453dd8",
         "info_dict": {
-          "title":"The Royal Concept - Gimme Twice"
+          "upload_date": "20120521",
+          "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+          "uploader": "The Royal Concept",
+          "title": "The Royal Concept - Gimme Twice"
         }
       },
       {
         "file":"47127627.mp3",
         "md5":"154abd4e418cea19c3b901f1e1306d9c",
         "info_dict": {
-          "title":"Goldrushed"
+          "upload_date": "20120521",
+          "uploader": "The Royal Concept",
+          "title": "Goldrushed"
         }
       },
       {
         "file":"47127629.mp3",
         "md5":"2f5471edc79ad3f33a683153e96a79c1",
         "info_dict": {
-          "title":"In the End"
+          "upload_date": "20120521",
+          "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+          "uploader": "The Royal Concept",
+          "title": "In the End"
         }
       },
       {
         "file":"47127631.mp3",
         "md5":"f9ba87aa940af7213f98949254f1c6e2",
         "info_dict": {
-          "title":"Knocked Up"
+          "upload_date": "20120521",
+          "description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com",
+          "uploader": "The Royal Concept",
+          "title": "Knocked Up"
         }
       },
       {
         "file":"75206121.mp3",
         "md5":"f9d1fe9406717e302980c30de4af9353",
         "info_dict": {
-          "title":"World On Fire"
+          "upload_date": "20130116",
+          "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ",
+          "uploader": "The Royal Concept",
+          "title": "World On Fire"
         }
       }
     ]
@@ -419,8 +526,10 @@
     "url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0",
     "file": "zpsc0c3b9fa.mp4",
     "md5": "7dabfb92b0a31f6c16cebc0f8e60ff99",
-    "info_dict":{
-      "title":"Tired of Link Building? Try BacklinkMyDomain.com!"
+    "info_dict": {
+      "upload_date": "20130504",
+      "uploader": "rachaneronas",
+      "title": "Tired of Link Building? Try BacklinkMyDomain.com!"
     }
   },
   {
@@ -488,8 +597,10 @@
     "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html",
     "file": "1509445.flv",
     "md5": "9f48e0e8d58e3076bb236ff412ab62fa",
-    "info_dict":{
-      "title":"FemaleAgent Shy beauty takes the bait"
+    "info_dict": {
+      "upload_date": "20121014",
+      "uploader_id": "Ruseful2011",
+      "title": "FemaleAgent Shy beauty takes the bait"
     }
   },
   {
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 6060a5988c..24e9c4cc7b 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -2377,8 +2377,8 @@ class EscapistIE(InfoExtractor):
         showName = mobj.group('showname')
         videoId = mobj.group('episode')
 
-        self.report_extraction(showName)
-        webpage = self._download_webpage(url, showName)
+        self.report_extraction(videoId)
+        webpage = self._download_webpage(url, videoId)
 
         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
             webpage, u'description', fatal=False)
@@ -2389,10 +2389,13 @@ class EscapistIE(InfoExtractor):
         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
             webpage, u'player url')
 
+        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+            webpage, u'player url').split(' : ')[-1]
+
         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
         configUrl = compat_urllib_parse.unquote(configUrl)
 
-        configJSON = self._download_webpage(configUrl, showName,
+        configJSON = self._download_webpage(configUrl, videoId,
                                             u'Downloading configuration',
                                             u'unable to download configuration')
 
@@ -2412,7 +2415,7 @@ class EscapistIE(InfoExtractor):
             'url': videoUrl,
             'uploader': showName,
             'upload_date': None,
-            'title': showName,
+            'title': title,
             'ext': 'mp4',
             'thumbnail': imgUrl,
             'description': videoDesc,
@@ -3581,14 +3584,14 @@ class YouPornIE(InfoExtractor):
             size = format[0]
             bitrate = format[1]
             format = "-".join( format )
-            title = u'%s-%s-%s' % (video_title, size, bitrate)
+            # title = u'%s-%s-%s' % (video_title, size, bitrate)
 
             formats.append({
                 'id': video_id,
                 'url': video_url,
                 'uploader': video_uploader,
                 'upload_date': upload_date,
-                'title': title,
+                'title': video_title,
                 'ext': extension,
                 'format': format,
                 'thumbnail': thumbnail,
@@ -4328,7 +4331,7 @@ class XHamsterIE(InfoExtractor):
             video_upload_date = None
             self._downloader.report_warning(u'Unable to extract upload date')
 
-        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
             webpage, u'uploader id', default=u'anonymous')
 
         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',

From ee55fcbe121baa0dacc9f87b9aa3abd974291355 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 15:03:54 +0200
Subject: [PATCH 11/12] switch long info_dict fields checking to md5

---
 test/test_download.py | 9 ++++-----
 test/tests.json       | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/test_download.py b/test/test_download.py
index 8621520336..577bcdbf2d 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -38,8 +38,7 @@ def _try_rm(filename):
         if ose.errno != errno.ENOENT:
             raise
 
-def crc32(value):
-    return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff)
+md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
 
 class FileDownloader(youtube_dl.FileDownloader):
     def __init__(self, *args, **kwargs):
@@ -127,13 +126,13 @@ def generator(test_case):
                 with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
                     info_dict = json.load(infof)
                 for (info_field, value) in tc.get('info_dict', {}).items():
-                    if isinstance(value, compat_str) and value.startswith('crc32:'):
-                        self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field)))
+                    if isinstance(value, compat_str) and value.startswith('md5:'):
+                        self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field)))
                     else:
                         self.assertEqual(value, info_dict.get(info_field))
 
                 # If checkable fields are missing from the test case, print the info_dict
-                test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value))
+                test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
                     for key, value in info_dict.items()
                     if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
                 if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
diff --git a/test/tests.json b/test/tests.json
index e9abb0950f..8a3e8e8e12 100644
--- a/test/tests.json
+++ b/test/tests.json
@@ -29,7 +29,7 @@
     "info_dict": {
       "upload_date": "20090102",
       "title": "The Electric Company | \"Short I\" | PBS KIDS GO!",
-      "description": "crc32:5ef3bc57",
+      "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8",
       "uploader": "PBS",
       "uploader_id": "PBS"
     }
@@ -41,7 +41,7 @@
     "file":  "5779306.m4v",
     "info_dict": {
       "upload_date": "20111205",
-      "description": "crc32:fa658d49",
+      "description": "md5:9bc31f227219cde65e47eeec8d2dc596",
       "uploader": "Comic Book Resources - CBR TV",
       "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
     }

From af44c9486255f16ab180a9e45aaab06a6b38bdde Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Mon, 17 Jun 2013 19:25:35 +0200
Subject: [PATCH 12/12] use _search_regex in GenericIE

---
 youtube_dl/InfoExtractors.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 24e9c4cc7b..3c95012b19 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor):
         #   Site Name | Video Title
         #   Video Title - Tagline | Site Name
         # and so on and so forth; it's just not practical
-        mobj = re.search(r'<title>(.*)</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title')
 
         # video uploader is domain name
-        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_uploader = mobj.group(1)
+        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+            url, u'video uploader')
 
         return [{
             'id':       video_id,