From c29f5a7fae93a08f3cfbb6127b2faa75145b06a0 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 24 Oct 2024 23:11:48 +0000 Subject: [PATCH] [ie/generic] Do not impersonate by default (fix edfd095b1917701c5046bd51f9542897c17d41a7) (#11336) Closes #11335 Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/generic.py | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 70bbf3da19..418203eea9 100644 --- a/README.md +++ b/README.md @@ -1791,7 +1791,7 @@ The following extractors use this feature: * `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` -* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `safari,chrome-110`. By default any available target will be used. Use `false` to disable impersonation +* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `generic:impersonate=safari,chrome-110`. Use `generic:impersonate` to impersonate any available target, and use `generic:impersonate=false` to disable impersonation (default) #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 9b5421e41d..320a47772b 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring +from ..cookies import LenientSimpleCookie +from ..networking.exceptions import HTTPError from ..networking.impersonate import ImpersonateTarget from ..utils import ( KNOWN_EXTENSIONS, @@ -2374,10 +2376,9 @@ class GenericIE(InfoExtractor): else: video_id = self._generic_id(url) - # Try to impersonate a web-browser by default if possible - # Skip impersonation if not available to omit the warning - impersonate = self._configuration_arg('impersonate', ['']) - if 'false' in impersonate or not self._downloader._impersonate_target_available(ImpersonateTarget()): + # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335 + impersonate = self._configuration_arg('impersonate', ['false']) + if 'false' in impersonate: impersonate = None # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) @@ -2388,10 +2389,29 @@ class GenericIE(InfoExtractor): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. - full_response = self._request_webpage(url, video_id, headers=filter_dict({ - 'Accept-Encoding': 'identity', - 'Referer': smuggled_data.get('referer'), - }), impersonate=impersonate) + try: + full_response = self._request_webpage(url, video_id, headers=filter_dict({ + 'Accept-Encoding': 'identity', + 'Referer': smuggled_data.get('referer'), + }), impersonate=impersonate) + except ExtractorError as e: + if not (isinstance(e.cause, HTTPError) and e.cause.status == 403 + and e.cause.response.get_header('cf-mitigated') == 'challenge' + and e.cause.response.extensions.get('impersonate') is None): + raise + cf_cookie_domain = traverse_obj( + LenientSimpleCookie(e.cause.response.get_header('set-cookie')), + ('__cf_bm', 'domain')) + if cf_cookie_domain: + self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}') + self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm') + msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; ' + if not self._downloader._impersonate_target_available(ImpersonateTarget()): + msg += ('see https://github.com/yt-dlp/yt-dlp#impersonation for ' + 'how to install the required impersonation dependency, and ') + raise ExtractorError( + f'{msg}try again with --extractor-args "generic:impersonate"', expected=True) + new_url = full_response.url if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url)