[extractor/generic] Remove HEAD request

This commit is contained in:
pukkandan 2022-07-07 12:00:23 +05:30
parent 168bbc4f38
commit 6154438178
No known key found for this signature in database
GPG Key ID: 7EEE9E1E817D0A39

View File

@ -111,7 +111,6 @@ from ..compat import compat_etree_fromstring
from ..utils import ( from ..utils import (
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
ExtractorError, ExtractorError,
HEADRequest,
UnsupportedError, UnsupportedError,
determine_ext, determine_ext,
dict_get, dict_get,
@ -124,7 +123,6 @@ from ..utils import (
orderedSet, orderedSet,
parse_duration, parse_duration,
parse_resolution, parse_resolution,
sanitized_Request,
smuggle_url, smuggle_url,
str_or_none, str_or_none,
try_call, try_call,
@ -2807,49 +2805,30 @@ class GenericIE(InfoExtractor):
else: else:
video_id = self._generic_id(url) video_id = self._generic_id(url)
self.to_screen('%s: Requesting header' % video_id) # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
# making it impossible to download only a chunk of the file (yet we need only 512 bytes to
head_req = HEADRequest(url) # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
head_response = self._request_webpage( # that will always result in downloading the whole file that is not desirable.
head_req, video_id, # Therefore for extraction pass we have to override Accept-Encoding to any in order
note=False, errnote='Could not send HEAD request to %s' % url, # to accept raw bytes and be able to download only a chunk.
fatal=False) # It would probably be better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
if head_response is not False: full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'})
# Check for redirect new_url = full_response.geturl()
new_url = head_response.geturl() if url != new_url:
if url != new_url: self.report_following_redirect(new_url)
self.report_following_redirect(new_url) if force_videoid:
if force_videoid: new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
new_url = smuggle_url( return self.url_result(new_url)
new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
def request_webpage():
request = sanitized_Request(url)
# Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
# making it impossible to download only chunk of the file (yet we need only 512kB to
# test whether it's HTML or not). According to yt-dlp default Accept-Encoding
# that will always result in downloading the whole file that is not desirable.
# Therefore for extraction pass we have to override Accept-Encoding to any in order
# to accept raw bytes and being able to download only a chunk.
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after HEAD request finishes, but not sure if we can rely on this.
request.add_header('Accept-Encoding', '*')
return self._request_webpage(request, video_id)
full_response = None
if head_response is False:
head_response = full_response = request_webpage()
info_dict = { info_dict = {
'id': video_id, 'id': video_id,
'title': self._generic_title(url), 'title': self._generic_title(url),
'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
} }
# Check for direct link to a video # Check for direct link to a video
content_type = head_response.headers.get('Content-Type', '').lower() content_type = full_response.headers.get('Content-Type', '').lower()
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m: if m:
self.report_detected('direct video link') self.report_detected('direct video link')
@ -2878,7 +2857,6 @@ class GenericIE(InfoExtractor):
self.report_warning( self.report_warning(
'%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
full_response = full_response or request_webpage()
first_bytes = full_response.read(512) first_bytes = full_response.read(512)
# Is it an M3U playlist? # Is it an M3U playlist?
@ -4103,7 +4081,7 @@ class GenericIE(InfoExtractor):
webpage) webpage)
if not found: if not found:
# Look also in Refresh HTTP header # Look also in Refresh HTTP header
refresh_header = head_response.headers.get('Refresh') refresh_header = full_response.headers.get('Refresh')
if refresh_header: if refresh_header:
found = re.search(REDIRECT_REGEX, refresh_header) found = re.search(REDIRECT_REGEX, refresh_header)
if found: if found: