From ecdec1913fbe350b4dddd8b459b0f43464a8c5bc Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 17 Sep 2020 14:22:07 -0400 Subject: [PATCH 1/7] Keep download archive in memory for better performance The old behavior was to open and scan the entire archive file for every single video download. This resulted in horrible performance for archives of any remotely large size, especially since all new video IDs are appended to the end of the archive. For anyone who uses the archive feature to maintain archives of entire video playlists or channels, this meant that all such lists with newer downloads would have to scan close to the end of the archive file before the potential download was rejected. For archives with tens of thousands of lines, this easily resulted in millions of line reads and checks over the course of scanning a single channel or playlist that had been seen previously. The new behavior in this commit is to preload the archive file into a binary search tree and scan the tree instead of constantly scanning the file on disk for every file. When a new download is appended to the archive file, it is also added to this tree. The performance is massively better using this strategy over the more "naive" line-by-line archive file parsing strategy. The only negative consequence of this change is that the archive in memory will not be synchronized with the archive file on disk. Running multiple instances of the program at the same time that all use the same archive file may result in duplicate archive entries or duplicated downloads. This is unlikely to be a serious issue for the vast majority of users. If the instances are not likely to try to download identical video IDs then this should not be a problem anyway; for example, having two instances pull two completely different YouTube channels at once should be fine. Signed-off-by: Jody Bruchon --- youtube_dlc/YoutubeDL.py | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 1e2070f8cc..d2d1008501 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -113,6 +113,43 @@ from .version import __version__ if compat_os_name == 'nt': import ctypes +class ArchiveTree(object): + def __init__(self, line): + self.left = None + self.right = None + self.line = line + + def at_insert(self, line): + print("at_insert: ", line) + if self.line: + if line < self.line: + if self.left is None: + self.left = ArchiveTree(line) + else: + self.left.at_insert(line) + elif line > self.line: + if self.right is None: + self.right = ArchiveTree(line) + else: + self.right.at_insert(line) + else: + self.line = line + + def at_exist(self, line): + print("at_exist: ", line) + if self.line is None: + return False + if line < self.line: + if self.left is None: + return False + return self.left.at_exist(line) + elif line > self.line: + if self.right is None: + return False + return self.right.at_exist(line) + else: + return True + class YoutubeDL(object): """YoutubeDL class. @@ -359,6 +396,21 @@ class YoutubeDL(object): } self.params.update(params) self.cache = Cache(self) + self.archive = ArchiveTree(None) + + """Preload the archive, if any is specified""" + def preload_download_archive(self): + fn = self.params.get('download_archive') + if fn is None: + return False + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + self.archive.at_insert(line.strip()) + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return True def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: @@ -367,6 +419,8 @@ class YoutubeDL(object): return True return False + preload_download_archive(self) + if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): if self.params.get('geo_verification_proxy') is None: self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] @@ -722,7 +776,7 @@ class YoutubeDL(object): return None def _match_entry(self, info_dict, incomplete): - """ Returns None iff the file should be downloaded """ + """ Returns None if the file should be downloaded """ video_title = info_dict.get('title', info_dict.get('id', 'video')) if 'title' in info_dict: @@ -2142,15 +2196,7 @@ class YoutubeDL(object): if not vid_id: return False # Incomplete video information - try: - with locked_file(fn, 'r', encoding='utf-8') as archive_file: - for line in archive_file: - if line.strip() == vid_id: - return True - except IOError as ioe: - if ioe.errno != errno.ENOENT: - raise - return False + return self.archive.at_exist(vid_id) def record_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2160,6 +2206,7 @@ class YoutubeDL(object): assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + '\n') + self.archive.at_insert(vid_id) @staticmethod def format_resolution(format, default='unknown'): From a5029645aed1557ab6bf4cde3e70e10fc9917693 Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 17 Sep 2020 14:46:11 -0400 Subject: [PATCH 2/7] Remove debugging print statements --- youtube_dlc/YoutubeDL.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index d2d1008501..dbf8915d05 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -120,7 +120,6 @@ class ArchiveTree(object): self.line = line def at_insert(self, line): - print("at_insert: ", line) if self.line: if line < self.line: if self.left is None: @@ -136,7 +135,6 @@ class ArchiveTree(object): self.line = line def at_exist(self, line): - print("at_exist: ", line) if self.line is None: return False if line < self.line: From 1de7ea76f8d12d301759ab7c193eb71bcbcd6a1d Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 17 Sep 2020 15:08:33 -0400 Subject: [PATCH 3/7] Remove recursion in at_insert() --- youtube_dlc/YoutubeDL.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index dbf8915d05..bae42bf370 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -120,19 +120,31 @@ class ArchiveTree(object): self.line = line def at_insert(self, line): - if self.line: - if line < self.line: - if self.left is None: - self.left = ArchiveTree(line) +# print("at_insert: ", line) + cur = self + while True: +# print("comparing ", line, cur.line) + if cur.line: + if line < cur.line: + if cur.left is None: + cur.left = ArchiveTree(line) + return + else: + cur = cur.left + continue + elif line > cur.line: + if cur.right is None: + cur.right = ArchiveTree(line) + return + else: + cur = cur.right + continue else: - self.left.at_insert(line) - elif line > self.line: - if self.right is None: - self.right = ArchiveTree(line) - else: - self.right.at_insert(line) - else: - self.line = line + # Duplicate line found + return + else: + cur.line = line + return def at_exist(self, line): if self.line is None: @@ -417,6 +429,9 @@ class YoutubeDL(object): return True return False + if self.params.get('verbose'): + self.to_stdout('[debug] Loading archive file %r' % self.params.get('download_archive')) + preload_download_archive(self) if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): From 1d74d8d9f631c8ba610f581d59e825252ab16185 Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 17 Sep 2020 17:28:22 -0400 Subject: [PATCH 4/7] Try to mitigate the problem of loading a fully sorted archive Sorted archives turn the binary tree into a linked list and make things horribly slow. This is an incomplete mitigation for this issue. --- youtube_dlc/YoutubeDL.py | 45 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index bae42bf370..6da32960ac 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -113,12 +113,14 @@ from .version import __version__ if compat_os_name == 'nt': import ctypes +# Archive tree class ArchiveTree(object): def __init__(self, line): self.left = None self.right = None self.line = line + # Tree insertion def at_insert(self, line): # print("at_insert: ", line) cur = self @@ -130,6 +132,7 @@ class ArchiveTree(object): cur.left = ArchiveTree(line) return else: +# print("LEFT") cur = cur.left continue elif line > cur.line: @@ -137,6 +140,7 @@ class ArchiveTree(object): cur.right = ArchiveTree(line) return else: +# print("RIGHT") cur = cur.right continue else: @@ -410,16 +414,55 @@ class YoutubeDL(object): """Preload the archive, if any is specified""" def preload_download_archive(self): + lines = [] fn = self.params.get('download_archive') if fn is None: return False try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: - self.archive.at_insert(line.strip()) + lines.append(line.strip()) except IOError as ioe: if ioe.errno != errno.ENOENT: raise + lmax = len(lines) + if lmax >= 4: + # Populate binary search tree by splitting the archive list in half + # and then adding from the outside edges inward + # This mitigates the worst case where the archive has been sorted + ptrLL = 0 + ptrLR = lmax // 2 + ptrRL = ptrLR + 1 + ptrRR = lmax - 1 + inserted = 0 + while True: +# print("ptrs: %d %d %d %d" % (ptrLL, ptrLR, ptrRL, ptrRR)) + if ptrLR > ptrLL: + self.archive.at_insert(lines[ptrLR]) + inserted += 1 + ptrLR -= 1; + if ptrRL < ptrRR: + self.archive.at_insert(lines[ptrRL]) + inserted += 1 + ptrRL += 1; + if ptrLL < ptrLR: + self.archive.at_insert(lines[ptrLL]) + inserted += 1 + ptrLL += 1; + if ptrRR > ptrRL: + self.archive.at_insert(lines[ptrRR]) + inserted += 1 + ptrRR -= 1; + if ptrLL == ptrLR and ptrRL == ptrRR: + print("inserted: %d, lmax: %d" % (inserted, lmax)) + break + elif lmax > 0: + # Skip multi-line logic for a single line + for idx in lines: + self.archive.at_insert(idx) + else: + # No lines were loaded + return False return True def check_deprecated(param, option, suggestion): From fda63a4e873a7e5ce8282b63e30546c2c64e0458 Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Thu, 17 Sep 2020 21:45:40 -0400 Subject: [PATCH 5/7] Randomize archive order before populating search tree This doesn't result in an elegant, perfectly balanced search tree, but it's absolutely good enough. This commit completely mitigates the worst-case scenario where the archive file is sorted. Signed-off-by: Jody Bruchon --- youtube_dlc/YoutubeDL.py | 47 ++++++++++------------------------------ 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 6da32960ac..03bb245ba6 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -122,17 +122,14 @@ class ArchiveTree(object): # Tree insertion def at_insert(self, line): -# print("at_insert: ", line) cur = self while True: -# print("comparing ", line, cur.line) if cur.line: if line < cur.line: if cur.left is None: cur.left = ArchiveTree(line) return else: -# print("LEFT") cur = cur.left continue elif line > cur.line: @@ -140,7 +137,6 @@ class ArchiveTree(object): cur.right = ArchiveTree(line) return else: -# print("RIGHT") cur = cur.right continue else: @@ -426,43 +422,24 @@ class YoutubeDL(object): if ioe.errno != errno.ENOENT: raise lmax = len(lines) - if lmax >= 4: + if lmax > 10: # Populate binary search tree by splitting the archive list in half # and then adding from the outside edges inward # This mitigates the worst case where the archive has been sorted - ptrLL = 0 - ptrLR = lmax // 2 - ptrRL = ptrLR + 1 - ptrRR = lmax - 1 - inserted = 0 - while True: -# print("ptrs: %d %d %d %d" % (ptrLL, ptrLR, ptrRL, ptrRR)) - if ptrLR > ptrLL: - self.archive.at_insert(lines[ptrLR]) - inserted += 1 - ptrLR -= 1; - if ptrRL < ptrRR: - self.archive.at_insert(lines[ptrRL]) - inserted += 1 - ptrRL += 1; - if ptrLL < ptrLR: - self.archive.at_insert(lines[ptrLL]) - inserted += 1 - ptrLL += 1; - if ptrRR > ptrRL: - self.archive.at_insert(lines[ptrRR]) - inserted += 1 - ptrRR -= 1; - if ptrLL == ptrLR and ptrRL == ptrRR: - print("inserted: %d, lmax: %d" % (inserted, lmax)) + pos = 0 + while pos < lmax: + if lmax - pos <= 2: break - elif lmax > 0: - # Skip multi-line logic for a single line - for idx in lines: - self.archive.at_insert(idx) - else: + target = random.randrange(pos + 1, lmax - 1) + temp = lines[pos] + lines[pos] = lines[target] + lines[target] = lines[pos] + pos += 1 + elif lmax < 1: # No lines were loaded return False + for x in lines: + self.archive.at_insert(x) return True def check_deprecated(param, option, suggestion): From a4d834fb3ef0b9f31469c8de99797063bafbac4e Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Fri, 18 Sep 2020 00:11:36 -0400 Subject: [PATCH 6/7] Fix wrong variable in position swap corrupting archive list It's always a simple error in the end, you know? Signed-off-by: Jody Bruchon --- youtube_dlc/YoutubeDL.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index 03bb245ba6..fab56f2f86 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -423,17 +423,15 @@ class YoutubeDL(object): raise lmax = len(lines) if lmax > 10: - # Populate binary search tree by splitting the archive list in half - # and then adding from the outside edges inward - # This mitigates the worst case where the archive has been sorted pos = 0 while pos < lmax: if lmax - pos <= 2: break target = random.randrange(pos + 1, lmax - 1) + # Swap line at pos with randomly chosen target temp = lines[pos] lines[pos] = lines[target] - lines[target] = lines[pos] + lines[target] = temp pos += 1 elif lmax < 1: # No lines were loaded From 2459b6e1cf928f47b00c63cae23b4a10a912f44f Mon Sep 17 00:00:00 2001 From: Jody Bruchon Date: Fri, 18 Sep 2020 09:35:21 -0400 Subject: [PATCH 7/7] Style revisions --- youtube_dlc/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index fab56f2f86..0bdc983215 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -113,8 +113,9 @@ from .version import __version__ if compat_os_name == 'nt': import ctypes -# Archive tree + class ArchiveTree(object): + """Binary search tree for download archive entries""" def __init__(self, line): self.left = None self.right = None