Commit b85c8479 authored by Jeremy Hylton

Easy optimizations of urlparse for the common case of parsing an http URL.

1. Use dict.get instead of try/except KeyError for the cache lookup.
2. If the URL scheme is 'http', avoid the series of
   'if var in [someseq]:' tests; instead, inline all of the code.
3. Bind string.find to a local name (find = string.find).
parent f6ae743c
...@@ -45,15 +45,40 @@ def clear_cache(): ...@@ -45,15 +45,40 @@ def clear_cache():
# (e.g. netloc is a single string) and we don't expand % escapes. # (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_fragments = 1): def urlparse(url, scheme = '', allow_fragments = 1):
key = url, scheme, allow_fragments key = url, scheme, allow_fragments
try: cached = _parse_cache.get(key, None)
return _parse_cache[key] if cached:
except KeyError: return cached
pass
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache() clear_cache()
find = string.find
netloc = path = params = query = fragment = '' netloc = path = params = query = fragment = ''
i = string.find(url, ':') i = find(url, ':')
if i > 0: if i > 0:
if url[:i] == 'http': # optimize the common case
scheme = string.lower(url[:i])
url = url[i+1:]
if url[:2] == '//':
i = find(url, '/', 2)
if i < 0:
i = len(url)
netloc = url[2:i]
url = url[i:]
if allow_fragments:
i = string.rfind(url, '#')
if i >= 0:
url = url[:i]
fragment = url[i+1:]
i = find(url, '?')
if i >= 0:
url = url[:i]
query = url[i+1:]
i = find(url, ';')
if i >= 0:
url = url[:i]
params = url[i+1:]
tuple = scheme, netloc, url, params, query, fragment
_parse_cache[key] = tuple
return tuple
for c in url[:i]: for c in url[:i]:
if c not in scheme_chars: if c not in scheme_chars:
break break
...@@ -61,7 +86,7 @@ def urlparse(url, scheme = '', allow_fragments = 1): ...@@ -61,7 +86,7 @@ def urlparse(url, scheme = '', allow_fragments = 1):
scheme, url = string.lower(url[:i]), url[i+1:] scheme, url = string.lower(url[:i]), url[i+1:]
if scheme in uses_netloc: if scheme in uses_netloc:
if url[:2] == '//': if url[:2] == '//':
i = string.find(url, '/', 2) i = find(url, '/', 2)
if i < 0: if i < 0:
i = len(url) i = len(url)
netloc, url = url[2:i], url[i:] netloc, url = url[2:i], url[i:]
...@@ -70,11 +95,11 @@ def urlparse(url, scheme = '', allow_fragments = 1): ...@@ -70,11 +95,11 @@ def urlparse(url, scheme = '', allow_fragments = 1):
if i >= 0: if i >= 0:
url, fragment = url[:i], url[i+1:] url, fragment = url[:i], url[i+1:]
if scheme in uses_query: if scheme in uses_query:
i = string.find(url, '?') i = find(url, '?')
if i >= 0: if i >= 0:
url, query = url[:i], url[i+1:] url, query = url[:i], url[i+1:]
if scheme in uses_params: if scheme in uses_params:
i = string.find(url, ';') i = find(url, ';')
if i >= 0: if i >= 0:
url, params = url[:i], url[i+1:] url, params = url[:i], url[i+1:]
tuple = scheme, netloc, url, params, query, fragment tuple = scheme, netloc, url, params, query, fragment
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment