Commit e3207bd6 authored by Dirley Rodrigues

Improve external links finder to not yield duplicate links.

--HG--
branch : distribute
extra : rebase_source : 78e932fca32ee0ee1f50794cf998f4e7db78131b
parent cf2a2832
......@@ -2,6 +2,12 @@
CHANGES
=======
------
0.6.42
------
* External links finder no longer yields duplicate links.
------
0.6.41
------
......@@ -61,6 +67,7 @@ CHANGES
0.6.35
------
Note this release is backward-incompatible with distribute 0.6.23-0.6.34 in
how it parses version numbers.
......
......@@ -139,20 +139,26 @@ REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs

    `url` is the base URL that relative hrefs are resolved against and
    `page` is the page text to scan.  Two sources are examined, in order:

    1. every tag matched by ``REL`` whose comma-separated ``rel`` value
       contains ``homepage`` or ``download``;
    2. the first ``HREF`` match following each of the literal markers
       ``<th>Home Page`` and ``<th>Download URL``.

    Each resolved URL is yielded at most once, in first-seen order.
    """
    # Resolved links already yielded; consulted by both passes below so a
    # link found via rel= is not repeated by the <th> fallback.
    seen = set()

    for match in REL.finditer(page):
        tag, rel = match.groups()
        # Build a real list (not a lazy map object) so that both membership
        # tests below see every rel value.
        rels = [value.strip() for value in rel.lower().split(',')]
        if 'homepage' in rels or 'download' in rels:
            for href in HREF.finditer(tag):
                # Keep `url` untouched: it is the base for every join, so
                # the resolved link must live in its own name.
                link = urlparse.urljoin(url, htmldecode(href.group(1)))
                if link not in seen:
                    seen.add(link)
                    yield link

    for tag in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(tag)
        if pos != -1:
            # Only the first href after the marker is considered.
            match = HREF.search(page, pos)
            if match:
                link = urlparse.urljoin(url, htmldecode(match.group(1)))
                if link not in seen:
                    seen.add(link)
                    yield link
user_agent = "Python-urllib/%s distribute/%s" % (
sys.version[:3], require('distribute')[0].version
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment