diff --git a/README.md b/README.md index 4cb2dbb811..912a42f72a 100644 --- a/README.md +++ b/README.md @@ -2022,6 +2022,7 @@ While these options still work, their use is not recommended since there are oth These options are not intended to be used by the end-user --test Download only part of video for testing extractors + --load-pages Load pages dumped by --write-pages --youtube-print-sig-code For testing youtube signatures --allow-unplayable-formats List unplayable formats also --no-allow-unplayable-formats Default diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 12751b0092..d0abc395ad 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -758,6 +758,7 @@ def parse_options(argv=None): 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, 'write_pages': opts.write_pages, + 'load_pages': opts.load_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 6f0de61df8..669b6bd006 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -75,7 +75,6 @@ from ..utils import ( unified_strdate, unified_timestamp, update_Request, - update_url_query, url_basename, url_or_none, urljoin, @@ -724,6 +723,11 @@ class InfoExtractor: else: return err.code in variadic(expected_status) + def _create_request(self, url_or_request, data=None, headers={}, query={}): + if not isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = sanitized_Request(url_or_request) + return update_Request(url_or_request, data=data, headers=headers, query=query) + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): """ Return the response handle. @@ -755,16 +759,8 @@ class InfoExtractor: if 'X-Forwarded-For' not in headers: headers['X-Forwarded-For'] = self._x_forwarded_for_ip - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) try: - return self._downloader.urlopen(url_or_request) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): @@ -876,40 +872,44 @@ class InfoExtractor: 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) + def _request_dump_filename(self, url, video_id): + basen = f'{video_id}_{url}' + trim_length = self.get_param('trim_file_name') or 240 + if len(basen) > trim_length: + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:trim_length - len(h)] + h + filename = sanitize_filename(f'{basen}.dump', restricted=True) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if compat_os_name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = fR'\\?\{absfilepath}' + return filename + + def __decode_webpage(self, webpage_bytes, encoding, headers): + if not encoding: + encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes) + try: + return webpage_bytes.decode(encoding, 'replace') + except LookupError: + return webpage_bytes.decode('utf-8', 'replace') + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): - content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes - if not encoding: - encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self.get_param('dump_intermediate_pages', False): self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) - if self.get_param('write_pages', False): - basen = f'{video_id}_{urlh.geturl()}' - trim_length = self.get_param('trim_file_name') or 240 - if len(basen) > trim_length: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:trim_length - len(h)] + h - raw_filename = basen + '.dump' - filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen('Saving request to ' + filename) - # Working around MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = '\\\\?\\' + absfilepath + if self.get_param('write_pages'): + filename = self._request_dump_filename(video_id, urlh.geturl()) + self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) - try: - content = webpage_bytes.decode(encoding, 'replace') - except LookupError: - content = webpage_bytes.decode('utf-8', 'replace') - + content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers) self.__check_blocked(content) return content @@ -967,9 +967,21 @@ class InfoExtractor: content, urlh = res return parse(self, content, video_id, transform_source, fatal), urlh - def download_content( - self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs): - args = [url_or_request, video_id, note, errnote, transform_source, *args] + def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, *args, **kwargs): + if self.get_param('load_pages'): + url_or_request = self._create_request(url_or_request, data, headers, query) + filename = self._request_dump_filename(url_or_request.full_url, video_id) + self.to_screen(f'Loading request from {filename}') + try: + with open(filename, 'rb') as dumpf: + webpage_bytes = dumpf.read() + except OSError as e: + self.report_warning(f'Unable to load request from disk: {e}') + else: + content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers) + return parse(self, content, video_id, transform_source, fatal) + args = [url_or_request, video_id, note, errnote, transform_source, fatal, encoding, data, headers, query, *args] if parser is None: args.pop(4) # transform_source # The method is fetched by name so subclasses can override _download_..._handle diff --git a/yt_dlp/options.py b/yt_dlp/options.py index b44f5301b9..7cffcecfa3 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1154,6 +1154,10 @@ def create_parser(): '--write-pages', action='store_true', dest='write_pages', default=False, help='Write downloaded intermediary pages to files in the current directory to debug problems') + verbosity.add_option( + '--load-pages', + action='store_true', dest='load_pages', default=False, + help=optparse.SUPPRESS_HELP) verbosity.add_option( '--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False,