From b87e01c123fd560b6a674ce00f45a9459d82d98a Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 27 May 2023 19:08:19 +1200 Subject: [PATCH] [cookies] Move `YoutubeDLCookieJar` to cookies module (#7091) Authored by: coletdjnz --- test/test_YoutubeDLCookieJar.py | 8 +- yt_dlp/YoutubeDL.py | 7 +- yt_dlp/cookies.py | 144 +++++++++++++++++++++++++++++++- yt_dlp/extractor/common.py | 2 +- yt_dlp/utils/_legacy.py | 3 + yt_dlp/utils/_utils.py | 130 ---------------------------- 6 files changed, 157 insertions(+), 137 deletions(-) diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 0d4e7dc97..2c73d7d85 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import re import tempfile -from yt_dlp.utils import YoutubeDLCookieJar +from yt_dlp.cookies import YoutubeDLCookieJar class TestYoutubeDLCookieJar(unittest.TestCase): @@ -47,6 +47,12 @@ class TestYoutubeDLCookieJar(unittest.TestCase): # will be ignored self.assertFalse(cookiejar._cookies) + def test_get_cookie_header(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + header = cookiejar.get_cookie_header('https://www.foobar.foobar') + self.assertIn('HTTPONLY_COOKIE', header) + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e1e558836..f69bc98c5 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2404,7 +2404,7 @@ class YoutubeDL: if 'Youtubedl-No-Compression' in res: # deprecated res.pop('Youtubedl-No-Compression', None) res['Accept-Encoding'] = 'identity' - cookies = self._calc_cookies(info_dict['url']) + cookies = self.cookiejar.get_cookie_header(info_dict['url']) if cookies: res['Cookie'] = cookies @@ -2416,9 +2416,8 @@ class YoutubeDL: return res def _calc_cookies(self, url): - pr = sanitized_Request(url) - self.cookiejar.add_cookie_header(pr) - return pr.get_header('Cookie') + self.deprecation_warning('"YoutubeDL._calc_cookies" is deprecated and may be removed in a future version') + return self.cookiejar.get_cookie_header(url) def _sort_thumbnails(self, thumbnails): thumbnails.sort(key=lambda t: ( diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 4cafb522e..eb6a2656b 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,7 +1,9 @@ import base64 +import collections import contextlib import http.cookiejar import http.cookies +import io import json import os import re @@ -11,6 +13,7 @@ import subprocess import sys import tempfile import time +import urllib.request from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -29,11 +32,14 @@ from .dependencies import ( from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( Popen, - YoutubeDLCookieJar, error_to_str, + escape_url, expand_path, is_path_like, + sanitize_url, + str_or_none, try_call, + write_string, ) CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -1091,3 +1097,139 @@ class LenientSimpleCookie(http.cookies.SimpleCookie): else: morsel = None + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): + """ + See [1] for cookie file format. + + 1. https://curl.haxx.se/docs/http-cookies.html + """ + _HTTPONLY_PREFIX = '#HttpOnly_' + _ENTRY_LEN = 7 + _HEADER = '''# Netscape HTTP Cookie File +# This file is generated by yt-dlp. Do not edit. + +''' + _CookieFileEntry = collections.namedtuple( + 'CookieFileEntry', + ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) + + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard=False, ignore_expires=False): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, *args, **kwargs): + """ + Save cookies to a file. + Code is taken from CPython 3.6 + https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ + + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + # Store session cookies with `expires` set to 0 instead of an empty string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + + with self.open(filename, write=True) as f: + f.write(self._HEADER) + self._really_save(f, *args, **kwargs) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) + + def prepare_line(line): + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + # comments and empty lines are fine + if line.startswith('#') or not line.strip(): + return line + cookie_list = line.split('\t') + if len(cookie_list) != self._ENTRY_LEN: + raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) + cookie = self._CookieFileEntry(*cookie_list) + if cookie.expires_at and not cookie.expires_at.isdigit(): + raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) + return line + + cf = io.StringIO() + with self.open(filename) as f: + for line in f: + try: + cf.write(prepare_line(line)) + except http.cookiejar.LoadError as e: + if f'{line.strip()} '[0] in '[{"': + raise http.cookiejar.LoadError( + 'Cookies file must be Netscape formatted, not JSON. See ' + 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') + write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') + continue + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + def get_cookie_header(self, url): + """Generate a Cookie HTTP header for a given url""" + cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + self.add_cookie_header(cookie_req) + return cookie_req.get_header('Cookie') diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1b1dd560f..306911a6c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3444,7 +3444,7 @@ class InfoExtractor: def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index b0578a1d6..1097778f0 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -10,6 +10,9 @@ from ._utils import decode_base_n, preferredencoding from .traversal import traverse_obj from ..dependencies import certifi, websockets +# isort: split +from ..cookies import YoutubeDLCookieJar # noqa: F401 + has_certifi = bool(certifi) has_websockets = bool(websockets) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index d78022295..6f4f22bb3 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1518,136 +1518,6 @@ def is_path_like(f): return isinstance(f, (str, bytes, os.PathLike)) -class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): - """ - See [1] for cookie file format. - - 1. https://curl.haxx.se/docs/http-cookies.html - """ - _HTTPONLY_PREFIX = '#HttpOnly_' - _ENTRY_LEN = 7 - _HEADER = '''# Netscape HTTP Cookie File -# This file is generated by yt-dlp. Do not edit. - -''' - _CookieFileEntry = collections.namedtuple( - 'CookieFileEntry', - ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) - - def __init__(self, filename=None, *args, **kwargs): - super().__init__(None, *args, **kwargs) - if is_path_like(filename): - filename = os.fspath(filename) - self.filename = filename - - @staticmethod - def _true_or_false(cndn): - return 'TRUE' if cndn else 'FALSE' - - @contextlib.contextmanager - def open(self, file, *, write=False): - if is_path_like(file): - with open(file, 'w' if write else 'r', encoding='utf-8') as f: - yield f - else: - if write: - file.truncate(0) - yield file - - def _really_save(self, f, ignore_discard=False, ignore_expires=False): - now = time.time() - for cookie in self: - if (not ignore_discard and cookie.discard - or not ignore_expires and cookie.is_expired(now)): - continue - name, value = cookie.name, cookie.value - if value is None: - # cookies.txt regards 'Set-Cookie: foo' as a cookie - # with no name, whereas http.cookiejar regards it as a - # cookie with no value. - name, value = '', name - f.write('%s\n' % '\t'.join(( - cookie.domain, - self._true_or_false(cookie.domain.startswith('.')), - cookie.path, - self._true_or_false(cookie.secure), - str_or_none(cookie.expires, default=''), - name, value - ))) - - def save(self, filename=None, *args, **kwargs): - """ - Save cookies to a file. - Code is taken from CPython 3.6 - https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ - - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - # Store session cookies with `expires` set to 0 instead of an empty string - for cookie in self: - if cookie.expires is None: - cookie.expires = 0 - - with self.open(filename, write=True) as f: - f.write(self._HEADER) - self._really_save(f, *args, **kwargs) - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - """Load cookies from a file.""" - if filename is None: - if self.filename is not None: - filename = self.filename - else: - raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - - def prepare_line(line): - if line.startswith(self._HTTPONLY_PREFIX): - line = line[len(self._HTTPONLY_PREFIX):] - # comments and empty lines are fine - if line.startswith('#') or not line.strip(): - return line - cookie_list = line.split('\t') - if len(cookie_list) != self._ENTRY_LEN: - raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) - cookie = self._CookieFileEntry(*cookie_list) - if cookie.expires_at and not cookie.expires_at.isdigit(): - raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) - return line - - cf = io.StringIO() - with self.open(filename) as f: - for line in f: - try: - cf.write(prepare_line(line)) - except http.cookiejar.LoadError as e: - if f'{line.strip()} '[0] in '[{"': - raise http.cookiejar.LoadError( - 'Cookies file must be Netscape formatted, not JSON. See ' - 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp') - write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') - continue - cf.seek(0) - self._really_load(cf, filename, ignore_discard, ignore_expires) - # Session cookies are denoted by either `expires` field set to - # an empty string or 0. MozillaCookieJar only recognizes the former - # (see [1]). So we need force the latter to be recognized as session - # cookies on our own. - # Session cookies may be important for cookies-based authentication, - # e.g. usually, when user does not check 'Remember me' check box while - # logging in on a site, some important cookies are stored as session - # cookies so that not recognizing them will result in failed login. - # 1. https://bugs.python.org/issue17164 - for cookie in self: - # Treat `expires=0` cookies as session cookies - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True - - class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): def __init__(self, cookiejar=None): urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)