From 79a2e94e79e65cdf4898bc2dedb6a1bb4ca9af3c Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:21:39 +1000 Subject: [PATCH 1/3] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 9 +++++++-- youtube_dl/utils.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e16a6761b7..0072ba2419 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) - tests = 'a\xe4b\u4e2d\u56fd\u7684c' - self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') + tests = 'aäb\u4e2d\u56fd\u7684c' + self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' @@ -155,6 +155,11 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') + self.assertEqual(sanitize_filename( + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') + pass + def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7bcc85e2b5..f74f622680 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils import errno import functools import gzip -import itertools import io +import itertools import json import locale import math @@ -24,8 +24,8 @@ import os import pipes import platform import re -import ssl import socket +import ssl import struct import subprocess import sys @@ -365,6 +365,11 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + if restricted and char in accents: + return accents[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': From 31c4448f6ea8ec4236e0b67af33b53db1a93ef13 Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:25:12 +1000 Subject: [PATCH 2/3] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0072ba2419..00ada95eca 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -158,7 +158,6 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename( 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') - pass def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') From c587cbb793a6eda4fc7b1c7b4163e236abec1a00 Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Tue, 3 May 2016 10:40:30 +1000 Subject: [PATCH 3/3] improved performance by extracting accented chars to top level --- youtube_dl/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f74f622680..a5922b2b53 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = ( 'wav', 'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + def preferredencoding(): """Get preferred encoding. @@ -365,11 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): - accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) - if restricted and char in accents: - return accents[char] + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"':