2012-03-24 21:07:37 -04:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2014-03-23 20:40:09 -04:00
import calendar
2014-04-04 17:00:51 -04:00
import codecs
2014-02-24 19:43:17 -05:00
import contextlib
2013-12-15 23:04:12 -05:00
import ctypes
2013-08-28 06:57:10 -04:00
import datetime
import email . utils
2013-05-13 03:20:08 -04:00
import errno
2014-03-18 09:27:42 -04:00
import getpass
2012-03-24 21:07:37 -04:00
import gzip
2014-01-20 05:36:47 -05:00
import itertools
2012-11-27 18:09:17 -05:00
import io
2012-12-20 07:13:24 -05:00
import json
2012-03-24 21:07:37 -04:00
import locale
2013-11-24 21:12:26 -05:00
import math
2012-03-24 21:07:37 -04:00
import os
2013-10-12 07:49:27 -04:00
import pipes
2013-08-28 06:57:10 -04:00
import platform
2012-03-24 21:07:37 -04:00
import re
2013-11-24 00:37:14 -05:00
import ssl
2013-08-28 06:57:10 -04:00
import socket
2014-02-15 10:24:43 -05:00
import struct
2013-12-09 12:29:07 -05:00
import subprocess
2012-03-24 21:07:37 -04:00
import sys
2014-08-21 07:01:13 -04:00
import tempfile
2013-01-03 09:39:55 -05:00
import traceback
2014-03-10 12:31:32 -04:00
import xml . etree . ElementTree
2012-03-24 21:07:37 -04:00
import zlib
2012-11-27 17:54:09 -05:00
try :
2012-11-27 20:04:46 -05:00
import urllib . request as compat_urllib_request
2012-11-27 17:54:09 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import urllib2 as compat_urllib_request
2012-11-27 17:54:09 -05:00
try :
2012-11-27 20:04:46 -05:00
import urllib . error as compat_urllib_error
2012-11-27 17:54:09 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import urllib2 as compat_urllib_error
2012-11-27 17:54:09 -05:00
try :
2012-11-27 20:04:46 -05:00
import urllib . parse as compat_urllib_parse
2012-11-27 17:54:09 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import urllib as compat_urllib_parse
2012-11-27 17:54:09 -05:00
2012-11-27 22:51:27 -05:00
try :
from urllib . parse import urlparse as compat_urllib_parse_urlparse
except ImportError : # Python 2
from urlparse import urlparse as compat_urllib_parse_urlparse
2013-07-12 08:53:28 -04:00
try :
import urllib . parse as compat_urlparse
except ImportError : # Python 2
import urlparse as compat_urlparse
2012-11-27 17:54:09 -05:00
try :
2012-11-27 20:04:46 -05:00
import http . cookiejar as compat_cookiejar
2012-11-27 17:54:09 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import cookielib as compat_cookiejar
2012-11-27 17:54:09 -05:00
2012-11-27 18:02:55 -05:00
try :
2012-11-27 20:04:46 -05:00
import html . entities as compat_html_entities
2012-11-27 18:17:12 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import htmlentitydefs as compat_html_entities
2012-11-27 18:02:55 -05:00
2012-11-27 18:06:28 -05:00
try :
2012-11-27 20:04:46 -05:00
import html . parser as compat_html_parser
2012-11-27 18:17:12 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import HTMLParser as compat_html_parser
2012-11-27 18:06:28 -05:00
2012-11-27 18:13:00 -05:00
try :
2012-11-27 20:04:46 -05:00
import http . client as compat_http_client
2012-11-27 18:17:12 -05:00
except ImportError : # Python 2
2012-11-27 20:04:46 -05:00
import httplib as compat_http_client
2012-11-27 18:13:00 -05:00
2013-08-27 22:25:38 -04:00
try :
2013-08-28 04:18:39 -04:00
from urllib . error import HTTPError as compat_HTTPError
2013-08-27 22:25:38 -04:00
except ImportError : # Python 2
from urllib2 import HTTPError as compat_HTTPError
2013-09-21 08:19:30 -04:00
try :
from urllib . request import urlretrieve as compat_urlretrieve
except ImportError : # Python 2
from urllib import urlretrieve as compat_urlretrieve
2012-12-16 06:29:03 -05:00
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a writable handle on the null device.

        Used when subprocess has no DEVNULL constant; a new file object on
        os.path.devnull is opened on every call.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL."""
        return DEVNULL
2012-11-27 18:17:12 -05:00
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Fallback used when urllib.parse.unquote cannot be imported.
        Consecutive escapes are collected as raw bytes and decoded together
        with `encoding`/`errors`, so multi-byte characters survive.  A '%'
        that is not followed by two hex digits is kept literally.
        Passing None for encoding or errors selects 'utf-8' / 'replace'.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # The hex codec reports malformed or odd-length input with
                # TypeError on Python 2, so both must be caught to keep an
                # invalid escape such as '%zz' as literal text.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
2014-07-21 07:55:47 -04:00
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of decoded (name, value) pairs.

        Fields are separated by '&' or ';'.  Fields without '=' raise
        ValueError when strict_parsing is set; otherwise they are dropped
        unless keep_blank_values is set, in which case the value is ''.
        """
        def _decode(raw):
            # '+' means space in a query string; then undo %-escapes.
            spaced = raw.replace('+', ' ')
            unquoted = compat_urllib_parse_unquote(
                spaced, encoding=encoding, errors=errors)
            return unicode(unquoted)

        fields = []
        for amp_chunk in qs.split('&'):
            fields.extend(amp_chunk.split(';'))

        decoded = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if parts[1] or keep_blank_values:
                decoded.append((_decode(parts[0]), _decode(parts[1])))
        return decoded

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        grouped = {}
        for key, val in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                   encoding=encoding, errors=errors):
            grouped.setdefault(key, []).append(val)
        return grouped
2012-11-27 18:13:00 -05:00
2012-11-27 18:02:55 -05:00
# Text (unicode) string type: `unicode` where that builtin exists,
# otherwise `str`.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where that builtin
# exists, otherwise `chr`.
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
2012-11-27 18:02:55 -05:00
2014-02-21 10:59:10 -05:00
try :
from xml . etree . ElementTree import ParseError as compat_xml_parse_error
except ImportError : # Python 2.6
from xml . parsers . expat import ExpatError as compat_xml_parse_error
2014-08-25 04:18:01 -04:00
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Return s wrapped in single quotes for use as one shell word.

        Embedded single quotes are emitted as '"'"' (close quote, a
        double-quoted single quote, reopen quote).
        """
        escaped = s.replace("'", "'\"'\"'")
        quoted = "'" + escaped
        quoted = quoted + "'"
        return quoted
2013-05-20 05:57:10 -04:00
def compat_ord(c):
    """Return the integer value of c.

    c is returned unchanged when it is exactly an int (e.g. an element
    obtained by indexing a Python 3 bytes object); otherwise ord(c).
    """
    return c if type(c) is int else ord(c)
2013-06-06 08:35:08 -04:00
# This is not clearly defined otherwise
# (the type of a compiled pattern object, obtained by compiling an empty
# pattern and taking its type).
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.  YoutubeDLHandler.http_request adds each of
# these to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
2012-12-30 12:22:36 -05:00
2012-03-24 21:07:37 -04:00
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    The locale's answer is validated by encoding a short test string with
    it; if the lookup or the encoding fails for any reason, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        # Unknown/unusable codec or broken locale settings.  `Exception`
        # (rather than a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        pref = 'UTF-8'

    return pref
2012-03-24 21:07:37 -04:00
2012-11-27 18:46:21 -05:00
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (must be a unicode/text string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
2012-03-24 21:07:37 -04:00
2012-12-20 07:13:24 -05:00
2014-08-21 07:01:13 -04:00
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        stream_opts = {'mode': 'wb'}
    else:
        stream_opts = {'mode': 'w', 'encoding': 'utf-8'}

    # The temporary file lives next to the target (same directory, target
    # name as prefix) and is not auto-deleted, so it can be renamed into
    # place once fully written.
    tmp = tempfile.NamedTemporaryFile(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
        **stream_opts)

    try:
        with tmp:
            json.dump(obj, tmp)
        os.rename(tmp.name, fn)
    except:
        # Whatever went wrong, try to remove the leftover temporary file and
        # then let the original error propagate.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted for them.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
2013-10-12 15:34:04 -04:00
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI.  Steps without a prefix
    are left untouched.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([_expand(step) for step in path.split('/')])
2012-03-24 21:07:37 -04:00
2014-09-13 03:09:55 -04:00
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    When nothing matches, None is returned, unless fatal is set, in which
    case ExtractorError is raised (naming the element `name`, or the xpath
    itself when no name is given).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
2012-11-27 18:06:28 -05:00
# Replace the parser module's module-level `locatestarttagend` pattern (the
# regex that spans a start tag together with its attributes) with the version
# below.  Note that re.VERBOSE only ignores whitespace outside character
# classes, so the pattern must not be reformatted carelessly.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE)  # backport bugfix
2013-09-13 16:05:29 -04:00
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    Subclasses read the raw document back from `self.html` (None until
    loads() has been called).
    """

    def __init__(self):
        # This method used to be spelled `__init`, which is name-mangled and
        # never invoked as a constructor, so `self.html` stayed undefined
        # until loads() ran.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the parser, feed it in one go and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        # Attribute name/value pair a start tag must carry to be selected.
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start_position, end_position] while parsing;
        # positions are the tuples returned by getpos().
        self.result = None
        # True while the parser is inside the selected element.
        self.started = False
        # Per-tag-name count of currently open tags since the match started.
        self.depth = {}
        # True between the matching start tag and the next parser event,
        # which is where the start position gets recorded.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up after more than 10 errors, or on any error once the
        # selected element has been entered.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Otherwise restart parsing on the remainder of the document that
        # follows the current line.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also an event that fixes the start
            # position of the content.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The selected element is closed once the open count for its own
            # tag name is back to zero; record where that happened.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser callback merely records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless tag, start and end were all recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) with 1-based line numbers.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be taken
            # relative to the already-trimmed start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
2013-02-01 11:29:50 -05:00
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, AttrParser.parse_endtag is replaced:
# when the remaining input starts with the literal text </scr'+'ipt> the
# parser position is simply advanced past it; anything else is delegated to
# the stock HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
2012-04-10 18:22:51 -04:00
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals `id` in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Parse errors are ignored; whatever the parser managed to isolate before
    the error (possibly None) is returned.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and report what was found so far.
        pass
    result = finder.get_result()
    return result
2012-04-10 18:22:51 -04:00
2013-09-13 16:05:29 -04:00
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the content attribute of every <meta> whose name matches
        (a later match overwrites an earlier one)."""
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the recorded content attribute, or None."""
        return self.result
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.

    Parse errors are ignored; the value recorded before the error (possibly
    None) is returned.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was recorded so far.
        pass
    content = meta_finder.get_result()
    return content
2012-04-10 18:22:51 -04:00
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; real line breaks come from markup.
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> (with surrounding whitespace) becomes a newline
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        # a paragraph boundary </p><p ...> becomes a newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
2012-04-10 18:22:51 -04:00
2012-03-24 21:07:37 -04:00
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output (its binary buffer
    when available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (os.path.join needs the parts unpacked; handing it the bare
        # generator never produced a usable path).
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # Retry with the sanitized name (retrying the original name
            # could only fail the same way again).
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
2012-03-24 21:07:37 -04:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
2012-11-26 17:58:46 -05:00
2012-12-03 09:36:24 -05:00
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(r'__+', '_', result)
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
2012-03-24 21:07:37 -04:00
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order.  Elements are compared with ==, so they need not be
    hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
2012-03-24 21:07:37 -04:00
2014-03-23 20:40:09 -04:00
2014-08-27 13:11:45 -04:00
def _htmlentity_transform ( entity ) :
""" Transforms an HTML entity to a character. """
# Known non-numeric HTML entity
if entity in compat_html_entities . name2codepoint :
return compat_chr ( compat_html_entities . name2codepoint [ entity ] )
mobj = re . match ( r ' #(x?[0-9]+) ' , entity )
if mobj is not None :
numstr = mobj . group ( 1 )
if numstr . startswith ( u ' x ' ) :
base = 16
numstr = u ' 0 %s ' % numstr
else :
base = 10
return compat_chr ( int ( numstr , base ) )
# Unknown entity in name, return its literal representation
return ( u ' & %s ; ' % entity )
2012-03-24 21:07:37 -04:00
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through unchanged; any other input must be a text
    (compat_str) string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _substitute(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _substitute, s)
2012-03-24 21:07:37 -04:00
2014-01-04 21:07:55 -05:00
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call

    Returns s unchanged wherever text strings can be used directly,
    otherwise s encoded to bytes (unencodable characters are dropped).
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if modern_windows and not for_subprocess:
        return s

    if modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return s.encode(encoding if encoding is not None else 'utf-8', 'ignore')
2014-05-16 09:47:54 -04:00
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True).

    Byte strings are still tolerated and are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
2013-02-21 11:09:39 -05:00
def decodeOption(optval):
    """Return the option value optval as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
2013-01-01 14:27:53 -05:00
2013-05-04 06:02:18 -04:00
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that can hold the value is used: hours appear from
    3600 seconds on and minutes from 60 seconds on (inclusive, so exactly
    one hour is '1:00:00' rather than '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
2013-12-29 09:28:32 -05:00
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all https requests.

    opts_no_check_certificate -- when true, server certificates are not
                                 verified
    kwargs                    -- passed through to the handler constructor

    Three strategies are used depending on what the running interpreter
    offers: a hand-rolled connection class before Python 3.2, a default
    SSL context where ssl.create_default_context exists, and a manually
    configured SSLContext otherwise.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to the generic SSLv23
                # negotiation if that fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a *server*, which is what
        # Purpose.SERVER_AUTH selects.  (Purpose.CLIENT_AUTH builds a
        # server-side context that does not verify the peer at all.)
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname has to be switched off before verify_mode may
            # be lowered to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
2013-05-04 06:19:02 -04:00
2013-01-01 14:27:53 -05:00
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prepended to the message.
        """
        # Network trouble and unavailable videos are never program bugs,
        # whatever the caller said.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
2013-01-01 14:27:53 -05:00
2013-10-23 08:38:03 -04:00
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2012-03-24 21:07:37 -04:00
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
2012-03-24 21:07:37 -04:00
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
2012-03-24 21:07:37 -04:00
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance as `msg`.
        setattr(self, 'msg', msg)
2012-03-24 21:07:37 -04:00
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
2012-03-24 21:07:37 -04:00
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
2012-03-24 21:07:37 -04:00
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded, expected = None, None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
2012-03-24 21:07:37 -04:00
2013-08-27 17:15:01 -04:00
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Likewise, a "Youtubedl-user-agent" header replaces the User-Agent of
    that request and is then removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, accepting both raw and zlib-wrapped streams."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, also where its constructor takes no code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the default headers and resolve the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # urllib stores header names in str.capitalize() form
            # ('User-agent'), so the lookup has to use the same form;
            # otherwise a header set by the caller is never found and gets
            # overwritten by the default.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off and keep the
                # first attempt that decompresses.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
2013-04-27 09:14:20 -04:00
2014-02-06 05:29:46 -05:00
2014-05-17 13:04:02 -04:00
def parse_iso8601(date_str, delimiter='T'):
    """Return a UNIX timestamp from the given date

    date_str is expected as 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally
    followed by 'Z' or a '+HHMM' / '-HH:MM' style UTC offset.
    None is passed through unchanged.
    """
    if date_str is None:
        return None

    utc_offset = datetime.timedelta()
    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    if tz_match:
        # Cut the timezone designator off; strptime cannot digest it
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            utc_offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    layout = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, layout) - utc_offset
    return calendar.timegm(utc_time.timetuple())
2013-04-27 09:14:20 -04:00
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Every known layout is tried in turn; when none of them fits, the string
    is handed to the RFC 2822 parser of email.utils.  Returns None when
    date_str is None or cannot be parsed at all.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)

    known_layouts = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]

    result = None
    for layout in known_layouts:
        try:
            parsed = datetime.datetime.strptime(date_str, layout)
        except ValueError:
            continue
        # No early exit: the last layout that fits determines the result
        result = parsed.strftime('%Y%m%d')

    if result is None:
        fields = email.utils.parsedate_tz(date_str)
        if fields:
            result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
    return result
2013-07-12 15:52:59 -04:00
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from a URL.

    The candidate is whatever follows the last '.' of the part before the
    query string; it is accepted only if it is purely alphanumeric.
    Otherwise (or when url is None) default_ext is returned.
    """
    if url is None:
        return default_ext

    without_query = url.split(u'?', 1)[0]
    candidate = without_query.rsplit(u'.', 1)[-1]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
2013-07-07 19:13:55 -04:00
2013-07-20 06:48:57 -04:00
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: the part of filename before its last
    '.', followed by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join([stem, sub_lang, sub_format])
2013-04-27 08:01:55 -04:00
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    relative = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation?
    approx_days = {'month': 30, 'year': 365}
    if unit in approx_days:
        amount *= approx_days[unit]
        unit = 'day'
    # timedelta wants plural keyword names ('days', 'weeks')
    return today + datetime.timedelta(**{unit + 's': amount})
2014-01-02 07:47:28 -05:00
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
2013-04-27 08:01:55 -04:00
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest / latest
        representable date.  Raises ValueError if start lies after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.end < self.start:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range (bounds are inclusive)"""
        candidate = date
        if not isinstance(candidate, datetime.date):
            candidate = date_from_str(candidate)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
2013-08-28 06:57:10 -04:00
def platform_name():
    """Returns the platform name as a compat_str"""
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
2013-08-28 12:22:28 -04:00
2014-04-07 16:48:13 -04:00
def _windows_write_string ( s , out ) :
""" Returns True if the string was written using special methods,
False if it has yet to be written out . """
# Adapted from http://stackoverflow.com/a/3259271/35070
import ctypes
import ctypes . wintypes
# Maps the file descriptors of stdout (1) and stderr (2) to the identifiers
# GetStdHandle expects (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE).
WIN_OUTPUT_IDS = {
1 : - 11 ,
2 : - 12 ,
}
2014-04-30 04:07:32 -04:00
try :
fileno = out . fileno ( )
except AttributeError :
# If the output stream doesn't have a fileno, it's virtual
return False
2014-04-07 16:48:13 -04:00
# Only stdout and stderr are handled here; any other descriptor is left
# to the caller's regular write path.
if fileno not in WIN_OUTPUT_IDS :
return False
# Build ctypes prototypes for the kernel32 functions used below.
GetStdHandle = ctypes . WINFUNCTYPE (
ctypes . wintypes . HANDLE , ctypes . wintypes . DWORD ) (
( " GetStdHandle " , ctypes . windll . kernel32 ) )
h = GetStdHandle ( WIN_OUTPUT_IDS [ fileno ] )
WriteConsoleW = ctypes . WINFUNCTYPE (
ctypes . wintypes . BOOL , ctypes . wintypes . HANDLE , ctypes . wintypes . LPWSTR ,
ctypes . wintypes . DWORD , ctypes . POINTER ( ctypes . wintypes . DWORD ) ,
ctypes . wintypes . LPVOID ) ( ( " WriteConsoleW " , ctypes . windll . kernel32 ) )
# Receives the number of characters WriteConsoleW actually wrote.
written = ctypes . wintypes . DWORD ( 0 )
GetFileType = ctypes . WINFUNCTYPE ( ctypes . wintypes . DWORD , ctypes . wintypes . DWORD ) ( ( " GetFileType " , ctypes . windll . kernel32 ) )
FILE_TYPE_CHAR = 0x0002
FILE_TYPE_REMOTE = 0x8000
GetConsoleMode = ctypes . WINFUNCTYPE (
ctypes . wintypes . BOOL , ctypes . wintypes . HANDLE ,
ctypes . POINTER ( ctypes . wintypes . DWORD ) ) (
( " GetConsoleMode " , ctypes . windll . kernel32 ) )
INVALID_HANDLE_VALUE = ctypes . wintypes . DWORD ( - 1 ) . value
# True when the handle is invalid, is not a character device (ignoring the
# FILE_TYPE_REMOTE bit), or GetConsoleMode fails on it - e.g. when the
# output has been redirected to a file or pipe.
def not_a_console ( handle ) :
if handle == INVALID_HANDLE_VALUE or handle is None :
return True
return ( ( GetFileType ( handle ) & ~ FILE_TYPE_REMOTE ) != FILE_TYPE_CHAR
or GetConsoleMode ( handle , ctypes . byref ( ctypes . wintypes . DWORD ( ) ) ) == 0 )
if not_a_console ( h ) :
return False
2014-04-20 22:59:44 -04:00
# Index of the first character above U+FFFF in s, or len(s) if there is none.
def next_nonbmp_pos ( s ) :
try :
return next ( i for i , c in enumerate ( s ) if ord ( c ) > 0xffff )
except StopIteration :
return len ( s )
# Write s in chunks of at most 1024 characters that stop before the next
# non-BMP character.  When s starts with a non-BMP character (count == 0),
# a length of 2 is passed instead and exactly one character of s is consumed.
while s :
count = min ( next_nonbmp_pos ( s ) , 1024 )
2014-04-07 16:48:13 -04:00
ret = WriteConsoleW (
2014-04-20 22:59:44 -04:00
h , s , count if count else 2 , ctypes . byref ( written ) , None )
2014-04-07 16:48:13 -04:00
if ret == 0 :
raise OSError ( ' Failed to write string ' )
2014-04-20 22:59:44 -04:00
if not count : # We just wrote a non-BMP character
assert written . value == 2
s = s [ 1 : ]
else :
assert written . value > 0
# Continue after whatever the console accepted (may be less than count)
s = s [ written . value : ]
2014-04-07 16:48:13 -04:00
return True
2014-04-07 13:57:42 -04:00
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the string is first
    offered to _windows_write_string.  Otherwise it is encoded (dropping
    unencodable characters) for byte-oriented streams, or written as text
    when the stream offers neither binary mode nor a .buffer attribute.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if (sys.platform == 'win32' and encoding is None
            and hasattr(out, 'fileno') and _windows_write_string(s, out)):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
2013-08-28 08:28:55 -04:00
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(char) for char in bs]
    return list(bs)
2013-08-28 12:22:28 -04:00
2013-08-28 09:59:07 -04:00
def intlist_to_bytes(xs):
    """Convert a sequence of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces single bytes
    return ''.join(map(chr, xs))
2013-10-02 02:41:03 -04:00
2013-10-05 22:27:09 -04:00
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f): via LockFileEx /
# UnlockFileEx from kernel32 on Windows, via fcntl.flock everywhere else.
if sys . platform == ' win32 ' :
import ctypes . wintypes
import msvcrt
# ctypes layout of the OVERLAPPED structure taken by LockFileEx/UnlockFileEx
class OVERLAPPED ( ctypes . Structure ) :
_fields_ = [
( ' Internal ' , ctypes . wintypes . LPVOID ) ,
( ' InternalHigh ' , ctypes . wintypes . LPVOID ) ,
( ' Offset ' , ctypes . wintypes . DWORD ) ,
( ' OffsetHigh ' , ctypes . wintypes . DWORD ) ,
( ' hEvent ' , ctypes . wintypes . HANDLE ) ,
]
kernel32 = ctypes . windll . kernel32
LockFileEx = kernel32 . LockFileEx
LockFileEx . argtypes = [
ctypes . wintypes . HANDLE , # hFile
ctypes . wintypes . DWORD , # dwFlags
ctypes . wintypes . DWORD , # dwReserved
ctypes . wintypes . DWORD , # nNumberOfBytesToLockLow
ctypes . wintypes . DWORD , # nNumberOfBytesToLockHigh
ctypes . POINTER ( OVERLAPPED ) # Overlapped
]
LockFileEx . restype = ctypes . wintypes . BOOL
UnlockFileEx = kernel32 . UnlockFileEx
UnlockFileEx . argtypes = [
ctypes . wintypes . HANDLE , # hFile
ctypes . wintypes . DWORD , # dwReserved
ctypes . wintypes . DWORD , # nNumberOfBytesToLockLow
ctypes . wintypes . DWORD , # nNumberOfBytesToLockHigh
ctypes . POINTER ( OVERLAPPED ) # Overlapped
]
UnlockFileEx . restype = ctypes . wintypes . BOOL
# Low and high 32 bits of the byte count passed to lock and unlock; together
# with offset 0 they are meant to span the whole file.
whole_low = 0xffffffff
whole_high = 0x7fffffff
# Lock f starting at offset 0; flag 0x2 is LOCKFILE_EXCLUSIVE_LOCK, 0x0 asks
# for a shared lock.  Raises OSError if LockFileEx reports failure.
def _lock_file ( f , exclusive ) :
overlapped = OVERLAPPED ( )
overlapped . Offset = 0
overlapped . OffsetHigh = 0
overlapped . hEvent = 0
# Stored on the file object so that _unlock_file can pass the same structure
f . _lock_file_overlapped_p = ctypes . pointer ( overlapped )
handle = msvcrt . get_osfhandle ( f . fileno ( ) )
if not LockFileEx ( handle , 0x2 if exclusive else 0x0 , 0 ,
whole_low , whole_high , f . _lock_file_overlapped_p ) :
raise OSError ( ' Locking file failed: %r ' % ctypes . FormatError ( ) )
# Release a lock taken by _lock_file on the same file object.
# Raises OSError if UnlockFileEx reports failure.
def _unlock_file ( f ) :
assert f . _lock_file_overlapped_p
handle = msvcrt . get_osfhandle ( f . fileno ( ) )
if not UnlockFileEx ( handle , 0 ,
whole_low , whole_high , f . _lock_file_overlapped_p ) :
raise OSError ( ' Unlocking file failed: %r ' % ctypes . FormatError ( ) )
else :
import fcntl
# Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f.
def _lock_file ( f , exclusive ) :
2014-08-31 19:41:25 -04:00
fcntl . flock ( f , fcntl . LOCK_EX if exclusive else fcntl . LOCK_SH )
2013-10-05 22:27:09 -04:00
# Drop the flock held on f.
def _unlock_file ( f ) :
2014-08-31 19:41:25 -04:00
fcntl . flock ( f , fcntl . LOCK_UN )
2013-10-05 22:27:09 -04:00
class locked_file(object):
    """Context manager around a file that is locked while in use.

    The file is opened on construction.  Entering the context takes a
    shared lock for mode 'r' and an exclusive one for 'a' / 'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file signals failure with OSError, which is
            # unrelated to IOError on Python 2 - catch both so that the file
            # handle is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2013-10-12 07:49:27 -04:00
def shell_quote(args):
    """Return a shell-escaped command line built from the list args.

    Byte-string arguments (e.g. file names produced by 'encodeFilename')
    are decoded with the file system encoding, falling back to UTF-8 when
    that encoding is unknown.
    """
    # pipes.quote is deprecated and gone in Python 3.13; shlex.quote is the
    # same function under its supported name (Python >= 3.3).
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
2013-10-15 06:05:13 -04:00
2013-10-17 18:46:35 -04:00
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
2013-10-15 06:05:13 -04:00
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload travels JSON-encoded in the URL fragment
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
2014-01-06 23:34:14 -05:00
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs that carry no smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
2013-11-24 21:12:26 -05:00
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number or a str holding one; None yields 'N/A'.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: values >= 1024**9 used to
        # raise IndexError, and tiny positive values produced a negative
        # index that silently selected 'YiB'.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
2013-12-06 07:36:36 -05:00
2013-12-09 12:29:07 -05:00
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, not a tty, unexpected output, ...), but
        # do not swallow KeyboardInterrupt / SystemExit like a bare except.
        pass
    return None
2013-12-09 13:39:41 -05:00
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not a full English month name.
    """
    english_months = (
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December',
    )
    try:
        position = english_months.index(name)
    except ValueError:
        return None
    return position + 1
2013-12-10 15:03:53 -05:00
2014-01-20 16:11:34 -05:00
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already introduce one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference are left
    untouched.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
2013-12-15 23:04:12 -05:00
def setproctitle(title):
    """Set the process name through prctl() of libc.so.6.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, i.e. NUL-terminated.  A buffer of exactly len(title) bytes
    # has no terminator, so prctl would read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2013-12-16 07:56:13 -05:00
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
2013-12-16 22:13:36 -05:00
2014-08-22 12:40:26 -04:00
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it
    does not finish with end."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], which
    # would wipe out the whole string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
2013-12-16 22:13:36 -05:00
def url_basename(url):
    """Return the last component of the URL's path, ignoring leading and
    trailing slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rsplit(u'/', 1)[-1]
2013-12-20 11:05:28 -05:00
class HEADRequest(compat_urllib_request.Request):
    """Request that always reports HEAD as its HTTP method."""

    def get_method(self):
        method = "HEAD"
        return method
2013-12-25 09:18:40 -05:00
2014-07-21 06:02:44 -04:00
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default for None / ''.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
2014-08-10 07:04:45 -04:00
2014-08-10 05:00:14 -04:00
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
2014-07-21 06:02:44 -04:00
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are dropped before the conversion.
    """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
2013-12-26 07:49:44 -05:00
2014-07-21 06:02:44 -04:00
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
2014-03-28 18:06:34 -04:00
2013-12-26 07:49:44 -05:00
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3h11m53s' or '12.5 seconds'.

    Returns the number of seconds (a float if a fractional part is present,
    otherwise an int), or None if s is None or not understood.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    total = int(m.group('secs'))
    for field, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        if m.group(field):
            total += int(m.group(field)) * seconds_per_unit
    if m.group('ms'):
        # 'ms' holds the fractional part including its leading dot
        total += float(m.group('ms'))
    return total
2014-01-03 06:52:27 -05:00
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
2014-01-07 00:23:41 -05:00
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started.
    """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
2014-01-20 05:36:47 -05:00
class PagedList(object):
    """Lazily evaluated list whose items are fetched page by page.

    pagefunc(pagenum) must return an iterable with the items of the given
    (zero-based) page; every page except the last is expected to hold
    exactly pagesize items.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching only the
        pages that are needed."""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_first = page_first + size
            if start >= next_first:
                continue

            page = list(self._pagefunc(pagenum))

            # Offsets of the requested slice inside this page
            lo = 0
            if page_first <= start < next_first:
                lo = start % size
            hi = None
            if end is not None and page_first <= end <= next_first:
                hi = ((end - 1) % size) + 1
            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
2014-02-09 11:56:10 -05:00
def uppercase_escape(s):
    """Decode every literal \\UXXXXXXXX escape sequence contained in s."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length); only the text matters
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2014-02-15 10:24:43 -05:00
# struct_pack / struct_unpack: like struct.pack / struct.unpack, but always
# usable with a text format string.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        ascii_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(ascii_spec, *args)

    def struct_unpack(spec, *args):
        ascii_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(ascii_spec, *args)
else:
    struct_unpack = struct.unpack
    struct_pack = struct.pack
2014-02-24 19:43:17 -05:00
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it afterwards.

    batch_fd is an open file-like object yielding one URL per line (text or
    UTF-8 encoded bytes).  Blank lines and lines starting with '#', ';' or
    ']' are skipped; a leading byte order mark is removed.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A BOM that was decoded correctly is u'\ufeff'; u'\xef\xbb\xbf' is
        # what its three bytes look like after a latin-1 style decoding.
        # Checking only the latter left real BOMs glued to the first URL.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2014-03-07 09:25:33 -05:00
def urlencode_postdata(*args, **kargs):
    """urlencode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2014-03-10 12:31:32 -04:00
2014-08-25 12:03:01 -04:00
# etree_iter(element): iterate over the elements of an XML tree
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
2014-03-10 12:31:32 -04:00
def parse_xml(s):
    """Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The parser argument is only passed along on Python >= 2.7
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
2014-03-18 09:27:42 -04:00
# compat_getpass: getpass.getpass that also accepts a text prompt on
# Python 2 under Windows (where the prompt is encoded first).
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
2014-03-20 19:59:51 -04:00
# Number associated with each US film rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2014-03-24 18:21:20 -04:00
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback(<payload>);' becomes '<payload>'.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
2014-04-21 01:12:02 -04:00
2014-08-21 20:33:29 -04:00
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Bare and single-quoted keys as well as single-quoted string values are
    turned into double-quoted strings, and a trailing comma in front of a
    closing ']' is removed.
    """
    def requote(token):
        # 'text' -> "text"
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        lead, key, colon, value = m.groups()
        if key.startswith("'"):
            key = requote(key)
        elif not key.startswith('"'):
            key = '"%s"' % key

        if value.startswith("'"):
            value = requote(value)

        return lead + key + colon + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', res)
2014-04-21 01:12:02 -04:00
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in quality_ids,
    or to -1 if the id is not listed.
    """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2014-04-30 04:02:03 -04:00
# Default output template: '<title>-<id>.<ext>'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2014-05-16 06:03:59 -04:00
# subprocess_check_output: subprocess.check_output, with a fallback for
# Python versions that lack it (2.6).
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Raises subprocess.CalledProcessError (with the captured output in
        its .output attribute) if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen.args only exists on Python >= 3.3, and on 2.6
            # CalledProcessError takes no 'output' keyword - exactly the
            # interpreter this fallback is for - so take the command from
            # our own arguments and attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output