Commit 6f78e2f6 authored by PJ Eby

Reduce screenscraping required for a package index. Homepage and
download URLs can now be marked with 'rel="homepage"' and
'rel="download"' respectively, and the 'Index of Packages' string is no
longer required.  Since PyPI doesn't yet support rel="" attributes, the
old "<th>"-matching code remains, as does the MD5 scraping.
(backport from trunk)

--HG--
branch : setuptools-0.6
extra : convert_revision : svn%3A6015fed2-1504-0410-9fe1-9d1591cc4771/sandbox/branches/setuptools-0.6%4050557
parent 55742ce1
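
For index authors, the reduced scraping means an ordinary link plus a rel
attribute is now enough. A hypothetical package page that this change lets
easy_install understand, with no 'Index of Packages' title and no <th> rows
(all names and URLs below are made up):

    <!-- hypothetical package page -->
    <a href="http://example.com/foo/" rel="homepage">Foo home page</a>
    <a href="http://example.com/dist/Foo-1.0.tar.gz" rel="download">Foo 1.0 source</a>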
@@ -1110,6 +1110,10 @@ Release Notes/Change History
   directories will use an internally-generated directory listing if there is
   no ``index.html`` file in the directory.
 
+ * Allow external links in a package index to be specified using
+   ``rel="homepage"`` or ``rel="download"``, without needing the old
+   PyPI-specific visible markup.
+
 0.6b3
  * Fix local ``--find-links`` eggs not being copied except with
    ``--always-copy``.
...
@@ -120,6 +120,47 @@ def interpret_distro_name(location, basename, metadata,
         py_version=py_version, precedence = precedence,
         platform = platform
     )
+
+REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
+# this line is here to fix emacs' cruddy broken syntax highlighting
+
+def find_external_links(url, page):
+    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
+
+    for match in REL.finditer(page):
+        tag, rel = match.groups()
+        rels = map(str.strip, rel.lower().split(','))
+        if 'homepage' in rels or 'download' in rels:
+            for match in HREF.finditer(tag):
+                yield urlparse.urljoin(url, match.group(1))
+
+    for tag in ("<th>Home Page", "<th>Download URL"):
+        pos = page.find(tag)
+        if pos!=-1:
+            match = HREF.search(page,pos)
+            if match:
+                yield urlparse.urljoin(url, match.group(1))
+
 class PackageIndex(Environment):
     """A distribution index that scans web pages for download URLs"""
@@ -211,7 +252,7 @@ class PackageIndex(Environment):
             parts = map(
                 urllib2.unquote, link[len(self.index_url):].split('/')
             )
-            if len(parts)==2:
+            if len(parts)==2 and '#' not in parts[1]:
                 # it's a package page, sanitize and index it
                 pkg = safe_name(parts[0])
                 ver = safe_version(parts[1])
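
The added '#' test keeps links that carry a fragment (e.g. an existing
#md5= or #egg= marker) from being misindexed as Name/Version package pages.
A hypothetical illustration of the guard by itself:

    import urllib2

    index_url = "http://pypi.example.org/pypi/"    # hypothetical index root
    for link in (index_url + "Foo/1.0",
                 index_url + "Foo/Foo-1.0.tar.gz#md5=0123abcd"):
        parts = map(urllib2.unquote, link[len(index_url):].split('/'))
        print len(parts)==2 and '#' not in parts[1]
    # True   -> treated as the package page for Foo 1.0
    # False  -> skipped; it's a download link, not a version page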
@@ -219,30 +260,30 @@ class PackageIndex(Environment):
                 return to_filename(pkg), to_filename(ver)
             return None, None
 
-        if url==self.index_url or 'Index of Packages</title>' in page:
-            # process an index page into the package-page index
-            for match in HREF.finditer(page):
-                scan( urlparse.urljoin(url, match.group(1)) )
-        else:
-            pkg,ver = scan(url)   # ensure this page is in the page index
-            # process individual package page
-            for tag in ("<th>Home Page", "<th>Download URL"):
-                pos = page.find(tag)
-                if pos!=-1:
-                    match = HREF.search(page,pos)
-                    if match:
-                        # Process the found URL
-                        new_url = urlparse.urljoin(url, match.group(1))
-                        base, frag = egg_info_for_url(new_url)
-                        if base.endswith('.py') and not frag:
-                            if pkg and ver:
-                                new_url+='#egg=%s-%s' % (pkg,ver)
-                            else:
-                                self.need_version_info(url)
-                        self.scan_url(new_url)
-        return PYPI_MD5.sub(
-            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
-        )
+        # process an index page into the package-page index
+        for match in HREF.finditer(page):
+            scan( urlparse.urljoin(url, match.group(1)) )
+
+        pkg, ver = scan(url)   # ensure this page is in the page index
+        if pkg:
+            # process individual package page
+            for new_url in find_external_links(url, page):
+                # Process the found URL
+                base, frag = egg_info_for_url(new_url)
+                if base.endswith('.py') and not frag:
+                    if ver:
+                        new_url+='#egg=%s-%s' % (pkg,ver)
+                    else:
+                        self.need_version_info(url)
+                self.scan_url(new_url)
+
+            return PYPI_MD5.sub(
+                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
+            )
+        else:
+            return ""   # no sense double-scanning non-package pages
 
     def need_version_info(self, url):
         self.scan_all(
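
The closing PYPI_MD5.sub(...) is the MD5 scraping the commit message
mentions: it folds PyPI's separate '(md5)' link into a #md5= fragment on
the main download URL. PYPI_MD5 itself is defined elsewhere in
package_index.py; the simplified pattern and markup below are hypothetical
stand-ins with the same group layout (1=href, 2=link text, 3=digest):

    import re

    PYPI_MD5_DEMO = re.compile(
        r'<a href="([^"#]+)">([^<]+)</a> '
        r'\(<a href="[^"]*digest=([0-9a-f]{32})">md5</a>\)'
    )

    page = ('<a href="/packages/source/F/Foo/Foo-1.0.tar.gz">Foo-1.0.tar.gz</a> '
            '(<a href="?:action=show_md5&amp;digest='
            '0123456789abcdef0123456789abcdef">md5</a>)')

    print PYPI_MD5_DEMO.sub(
        lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
    )
    # -> <a href="/packages/source/F/Foo/Foo-1.0.tar.gz#md5=0123456789abcdef0123456789abcdef">Foo-1.0.tar.gz</a>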
@@ -273,7 +314,7 @@ class PackageIndex(Environment):
             )
         self.scan_all()
 
-        for url in self.package_pages.get(requirement.key,()):
+        for url in list(self.package_pages.get(requirement.key,())):
             # scan each page that might be related to the desired package
             self.scan_url(url)
...
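
Wrapping the lookup in list() makes the loop iterate over a snapshot,
guarding against the mapping being mutated (e.g. by scan_url() registering
more pages for the same project) while the loop runs; in Python 2, a dict
that changes size during iteration raises RuntimeError. A hypothetical
sketch of the failure mode the copy avoids:

    pages = {'http://a/': True, 'http://b/': True}  # stand-in for package_pages[key]

    try:
        for url in pages:
            pages['http://c/'] = True    # simulates scan_url() finding a new page
    except RuntimeError, e:
        print 'broken:', e               # dictionary changed size during iteration

    for url in list(pages):              # snapshot first, as the patch does
        pages[url + '#seen'] = True      # safe: the snapshot is unaffected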