diff --git a/test/test_utils.py b/test/test_utils.py index 14607f6b8cb..50fb5f10147 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1505,6 +1505,33 @@ def headers(self): 'Content-Type': b'audio/mp3', }) self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and unquoted filename + urlh = UrlHandle({ + 'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition including spacing and uppercase + urlh = UrlHandle({ + 'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and extended filename parameter syntax + urlh = UrlHandle({ + 'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3", + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and both filename parameter syntaxes + urlh = UrlHandle({ + 'Content-Disposition': b'''attachment; filename="should ignore.mp4"; + FileName* = iso8859-15''costs%201%A4%20filename.mp3''', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and 'wrong' order of both syntaxes + urlh = UrlHandle({ + 'Content-Disposition': b'''attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3; + filename="should ignore.mp4"''', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') if __name__ == '__main__': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 90eb9f93c44..02631406c29 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -58,9 +58,10 @@ compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, compat_xpath, @@ 
-4309,9 +4310,22 @@ def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'): cd = encode_compat_str_or_none(getheader('Content-Disposition')) if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) + m = re.match(r'''(?xi) + attachment;\s* + (?:filename\s*=[^;]+?;\s*)? # possible initial filename=...;, ignored + filename(?P<x>\*)?\s*=\s* # filename/filename* = + (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?) # if * then charset'...' else maybe " + (?P<filename>(?(q)[^"]+(?=")|[^\s;]+)) # actual name of file + ''', cd) if m: - e = determine_ext(m.group('filename'), default_ext=None) + m = m.groupdict() + filename = m.get('filename') + if m.get('x'): + try: + filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8')) + except LookupError: # unrecognised character set name + pass + e = determine_ext(filename, default_ext=None) if e: return e