Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit cc54c1c

Browse files
vstinner authored and larryhastings committed Jul 12, 2017
bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2291)
The current regex-based splitting produces a wrong result. For example, given ``http://abc#@def``, web browsers parse that URL as ``http://abc/#@def``; that is, the host is ``abc``, the path is ``/``, and the fragment is ``@def``. (cherry picked from commit 90e01e5)
1 parent 71572bb commit cc54c1c

File tree

4 files changed

+45
-13
lines changed

4 files changed

+45
-13
lines changed
 

‎Lib/test/test_urlparse.py

+39-12
Original file line numberDiff line numberDiff line change
@@ -681,28 +681,35 @@ def test_default_scheme(self):
681681
def test_parse_fragments(self):
682682
# Exercise the allow_fragments parameter of urlparse() and urlsplit()
683683
tests = (
684-
("http:#frag", "path"),
685-
("//example.net#frag", "path"),
686-
("index.html#frag", "path"),
687-
(";a=b#frag", "params"),
688-
("?a=b#frag", "query"),
689-
("#frag", "path"),
684+
("http:#frag", "path", "frag"),
685+
("//example.net#frag", "path", "frag"),
686+
("index.html#frag", "path", "frag"),
687+
(";a=b#frag", "params", "frag"),
688+
("?a=b#frag", "query", "frag"),
689+
("#frag", "path", "frag"),
690+
("abc#@frag", "path", "@frag"),
691+
("//abc#@frag", "path", "@frag"),
692+
("//abc:80#@frag", "path", "@frag"),
693+
("//abc#@frag:80", "path", "@frag:80"),
690694
)
691-
for url, attr in tests:
695+
for url, attr, expected_frag in tests:
692696
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
693697
if attr == "params" and func is urllib.parse.urlsplit:
694698
attr = "path"
695699
with self.subTest(url=url, function=func):
696700
result = func(url, allow_fragments=False)
697701
self.assertEqual(result.fragment, "")
698-
self.assertTrue(getattr(result, attr).endswith("#frag"))
702+
self.assertTrue(
703+
getattr(result, attr).endswith("#" + expected_frag))
699704
self.assertEqual(func(url, "", False).fragment, "")
700705

701706
result = func(url, allow_fragments=True)
702-
self.assertEqual(result.fragment, "frag")
703-
self.assertFalse(getattr(result, attr).endswith("frag"))
704-
self.assertEqual(func(url, "", True).fragment, "frag")
705-
self.assertEqual(func(url).fragment, "frag")
707+
self.assertEqual(result.fragment, expected_frag)
708+
self.assertFalse(
709+
getattr(result, attr).endswith(expected_frag))
710+
self.assertEqual(func(url, "", True).fragment,
711+
expected_frag)
712+
self.assertEqual(func(url).fragment, expected_frag)
706713

707714
def test_mixed_types_rejected(self):
708715
# Several functions that process either strings or ASCII encoded bytes
@@ -883,6 +890,26 @@ def test_splithost(self):
883890
self.assertEqual(splithost('/foo/bar/baz.html'),
884891
(None, '/foo/bar/baz.html'))
885892

893+
# bpo-30500: # starts a fragment.
894+
self.assertEqual(splithost('//127.0.0.1#@host.com'),
895+
('127.0.0.1', '/#@host.com'))
896+
self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
897+
('127.0.0.1', '/#@host.com:80'))
898+
self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
899+
('127.0.0.1:80', '/#@host.com'))
900+
901+
# Empty host is returned as empty string.
902+
self.assertEqual(splithost("///file"),
903+
('', '/file'))
904+
905+
# Trailing semicolon, question mark and hash symbol are kept.
906+
self.assertEqual(splithost("//example.net/file;"),
907+
('example.net', '/file;'))
908+
self.assertEqual(splithost("//example.net/file?"),
909+
('example.net', '/file?'))
910+
self.assertEqual(splithost("//example.net/file#"),
911+
('example.net', '/file#'))
912+
886913
def test_splituser(self):
887914
splituser = urllib.parse.splituser
888915
self.assertEqual(splituser('User:Pass@www.python.org:080'),

‎Lib/urllib/parse.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ def splithost(url):
865865
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
866866
global _hostprog
867867
if _hostprog is None:
868-
_hostprog = re.compile('^//([^/?]*)(.*)$')
868+
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
869869

870870
match = _hostprog.match(url)
871871
if match:

‎Misc/ACKS

+1
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,7 @@ Max Neunhöffer
994994
Anthon van der Neut
995995
George Neville-Neil
996996
Hieu Nguyen
997+
Nam Nguyen
997998
Johannes Nicolai
998999
Samuel Nicolary
9991000
Jonathan Niehof
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix urllib.parse.splithost() to correctly parse fragments. For example,
2+
``splithost('//127.0.0.1#@evil.com/')`` now correctly returns the
3+
``127.0.0.1`` host, instead of treating ``@evil.com`` as the host in an
4+
authentication (``login@host``).

0 commit comments

Comments
 (0)
Please sign in to comment.