Commit e6572cb2 authored by PJ Eby

Implement PyPI screenscraping for EasyInstall. Fix a bug in requirement

version checking.  Document new options for screen scraping.

--HG--
branch : setuptools
extra : convert_revision : svn%3A6015fed2-1504-0410-9fe1-9d1591cc4771/sandbox/trunk/setuptools%4041041
parent 0c9e8869
......@@ -210,10 +210,42 @@ Command-Line Options
URL or filename, so that the installer will not be confused by the presence
of multiple ``setup.py`` files in the build directory.
``--scan-url=URL, -s URL`` (New in 0.4a1)
Scan the specified "download page" for direct links to downloadable eggs or
source distributions. Any usable packages will be downloaded if they are
required by a command line argument. For example, this::
easy_install -s http://peak.telecommunity.com/dist PyProtocols
will download and install the latest version of PyProtocols linked from
the PEAK downloads page, but ignore the other download links on that page.
You may use this option more than once, to list multiple download pages.
If all requested packages can be found using the specified download pages,
the Python Package Index will *not* be consulted.
``--index-url=URL, -u URL`` (New in 0.4a1)
Specifies the base URL of the Python Package Index. The default is
http://www.python.org/pypi if not specified. When a package is requested
that is not locally available or linked from a ``--scan-url`` download
page, the package index will be searched for download pages for the needed
package, and those download pages will be searched for links to download
an egg or source distribution.
Release Notes/Change History
============================
0.4a1
* Added ``--scan-url`` and ``--index-url`` options, to scan download pages
and search PyPI for needed packages.
* Fixed a bug in requirements processing for exact versions (i.e. ``==`` and
``!=``) when only one condition was included.
* Added ``safe_name()`` and ``safe_version()`` APIs to clean up handling of
arbitrary distribution names and versions found on PyPI.
0.3a4
* ``pkg_resources`` now supports resource directories, not just the resources
in them. In particular, there are ``resource_listdir()`` and
......
......@@ -28,139 +28,139 @@ import __builtin__
from distutils.sysconfig import get_python_lib
from shutil import rmtree # must have, because it can be called from __del__
from pkg_resources import *
_os = sys.modules[os.name]
_open = open
EXTENSIONS = (
(EGG_DIST, ".egg"),
(SOURCE_DIST, ".tar.gz"),
(SOURCE_DIST, ".tar.bz2"),
(SOURCE_DIST, ".tar"),
(SOURCE_DIST, ".zip"),
(SOURCE_DIST, ".tgz"),
)
class URLDistribution(Distribution):
"""A distribution that has not been installed"""
def __init__(self, url, metadata=None):
path = urlparse.urlparse(url)[2]
base = path.split('/')[-1]
for typecode, ext in EXTENSIONS:
if base.endswith(ext):
base = base[:-len(ext)]
break
else:
raise DistributionNotFound(url)
self.typecode = typecode
name, version, py_version, platform = [None]*4
match = pkg_resources.EGG_NAME(base)
if match:
name,version,py_version,platform = match.group(
'name','ver','pyver','plat'
)
else:
name = base
Distribution.__init__(self,
url, metadata=metadata, name=name, version=version or "0",
py_version=py_version or pkg_resources.PY_MAJOR, platform=platform
class Opener(urllib.FancyURLopener):
    """URL opener that returns HTTP error replies instead of raising."""

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Wrap an HTTP error reply as an ordinary response object.

        The returned ``addinfourl`` object carries the error code and
        message in its ``status`` and ``reason`` attributes.
        """
        response = urllib.addinfourl(fp, headers, "http:" + url)
        response.status = errcode
        response.reason = errmsg
        return response
# Shared opener instance used to fetch pages (see PackageIndex.process_url).
opener = Opener()
# Case-insensitive pattern for an ``href`` attribute; group 1 captures the
# link target up to the first quote, ``>`` or space.
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
# Filename suffixes that distros_for_url() treats as source distributions;
# they are tried in this order and the first match wins.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()
def distros_for_url(url, metadata=None):
    """Generate the distributions a download URL could plausibly refer to.

    The last path segment of `url` is unquoted and inspected.  An ``.egg``
    name yields a single distribution whose ``path`` is set to `url`.  A
    name ending in one of ``EXTENSIONS`` yields one source distribution per
    possible name/version split.  Anything else yields nothing.
    """
    filename = urllib.unquote(urlparse.urlparse(url)[2].split('/')[-1])

    if filename.endswith('.egg'):
        # An egg filename has only one, unambiguous interpretation
        egg = Distribution.from_filename(filename, metadata)
        egg.path = url
        yield egg
        return

    matched = [ext for ext in EXTENSIONS if filename.endswith(ext)]
    if not matched:
        return  # not a recognized source archive
    stem = filename[:-len(matched[0])]  # first matching suffix wins

    # Source distribution names are ambiguous about where the project name
    # ends and the version begins (e.g. "adns-python-1.1.0" or
    # "egenix-mx-commercial"), so every possible interpretation is
    # produced: "adns" + "python-1.1.0", "adns-python" + "1.1.0", and
    # "adns-python-1.1.0" with no version.  The spurious ones should be
    # harmless in practice: if a real "adns" package exists, the bogus
    # "python-1.1.0" version compares lower than any numeric version and is
    # unlikely to satisfy a request.  It remains a potential problem,
    # though; in the long run PyPI and the distutils should use "safe"
    # names and versions in source distribution filenames.
    pieces = stem.split('-')
    split_at = 1
    while split_at <= len(pieces):
        yield Distribution(
            url, metadata,
            '-'.join(pieces[:split_at]), '-'.join(pieces[split_at:]),
            distro_type = SOURCE_DIST
        )
        split_at += 1
class PackageIndex(AvailableDistributions):
    """A distribution index that scans web pages for download URLs"""

    def __init__(self,index_url="http://www.python.org/pypi",*args,**kw):
        """Create an index rooted at `index_url`.

        `index_url` is stored with exactly one trailing slash appended if
        it lacks one, so it can be used both as a prefix test and as a base
        for building package-page URLs.  Remaining arguments are passed to
        ``AvailableDistributions.__init__``.
        """
        AvailableDistributions.__init__(self,*args,**kw)
        if not index_url.endswith('/'):
            index_url += '/'
        self.index_url = index_url
        self.scanned_urls = {}      # URLs already examined for distro names
        self.fetched_urls = {}      # URLs whose page has been downloaded
        self.package_pages = {}     # lowercased package name -> {page URL: True}

    def scan_url(self, url):
        """Process `url`, downloading the page if it is not itself a distro."""
        self.process_url(url, True)

    def process_url(self, url, retrieve=False):
        """Record any distributions named by `url`; optionally scan the page.

        When `retrieve` is false, only the URL itself is interpreted.  When
        true and the URL does not name a distribution, the page is fetched
        (at most once per URL) and, if it is HTML, either handed to
        ``process_index`` (for URLs under the package index) or searched
        for links, each of which is processed without retrieval.
        """
        if url in self.scanned_urls and not retrieve:
            return

        self.scanned_urls[url] = True
        dists = list(distros_for_url(url))
        for dist in dists:
            self.add(dist)

        if dists or not retrieve or url in self.fetched_urls:
            # don't need the actual page
            return

        f = opener.open(url)
        try:
            # Mark both the requested and the final (redirected) URL
            self.fetched_urls[url] = self.fetched_urls[f.url] = True
            if 'html' not in f.headers.get('content-type', '').lower():
                return  # not html (or unknown type), we can't process it
            base = f.url    # handle redirects
            page = f.read()
        finally:
            f.close()

        if url.startswith(self.index_url):
            self.process_index(url, page)
        else:
            for match in HREF.finditer(page):
                link = urlparse.urljoin(base, match.group(1))
                self.process_url(link)

    def find_packages(self,requirement):
        """Scan the index pages that may lead to downloads for `requirement`."""
        self.scan_url(self.index_url + requirement.distname)
        if not self.package_pages.get(requirement.key):
            # We couldn't find the target package, so search the index page too
            self.scan_url(self.index_url)
        for url in self.package_pages.get(requirement.key,()):
            # scan each page that might be related to the desired package
            self.scan_url(url)

    def process_index(self,url,page):
        """Digest a page fetched from under ``index_url``.

        A listing page has its links recorded in ``package_pages``.  Any
        other page is treated as an individual package page: it is recorded
        itself, and the first link after each of its "Home Page" and
        "Download URL" table headers is scanned.
        """
        def scan(link):
            # Record `link` if it looks like <index_url>name/version
            if not link.startswith(self.index_url):
                return
            parts = [
                urllib.unquote(part)
                for part in link[len(self.index_url):].split('/')
            ]
            if len(parts)==2:
                # it's a package page, sanitize and index it
                pkg = safe_name(parts[0])
                self.package_pages.setdefault(pkg.lower(),{})[link] = True

        if url==self.index_url or 'Index of Packages</title>' in page:
            # process an index page into the package-page index
            for match in HREF.finditer(page):
                scan( urlparse.urljoin(url, match.group(1)) )
        else:
            scan(url)   # ensure this page is in the page index
            # process individual package page
            for tag in ("<th>Home Page", "<th>Download URL"):
                pos = page.find(tag)
                if pos!=-1:
                    match = HREF.search(page,pos)
                    if match:
                        # Process the found URL
                        self.scan_url(urlparse.urljoin(url, match.group(1)))

    def obtain(self,requirement):
        """Return the first known distribution satisfying `requirement`.

        The index is searched first via ``find_packages``; ``None`` is
        returned implicitly when nothing matches.
        """
        self.find_packages(requirement)
        for dist in self.get(requirement.key, ()):
            if dist in requirement:
                return dist
class Installer:
"""Manage a download/build/install process"""
......@@ -801,11 +801,11 @@ def main(argv, factory=Installer):
parser.add_option("-u", "--index-url", dest="index_url", metavar="URL",
default="http://www.python.org/pypi",
help="Base URL of Python Package Index")
help="base URL of Python Package Index")
parser.add_option("-s", "--scan-url", dest="scan_urls", metavar="URL",
action="append",
help="Additional URL(s) to search for packages")
help="additional URL(s) to search for packages")
(options, args) = parser.parse_args()
......
......@@ -54,6 +54,9 @@ class InvalidOption(ResolutionError):
_provider_factories = {}
PY_MAJOR = sys.version[:3]
EGG_DIST = 2
SOURCE_DIST = 1
def register_loader_type(loader_type, provider_factory):
"""Register `provider_factory` to make providers for `loader_type`
......@@ -77,9 +80,6 @@ def get_provider(moduleName):
def get_platform():
"""Return this platform's string for platform-specific distributions
......@@ -1146,23 +1146,21 @@ def parse_version(s):
EGG_DIST = 2
SOURCE_DIST = 1
class Distribution(object):
"""Wrap an actual or potential sys.path entry w/metadata"""
typecode = EGG_DIST
def __init__(self,
    path_str, metadata=None, name=None, version=None,
    py_version=PY_MAJOR, platform=None, distro_type = EGG_DIST
):
    """Describe a distribution located at `path_str`.

    `name` and `version`, when given and non-empty, are normalized with
    ``safe_name()`` / ``safe_version()`` before being stored; otherwise
    the corresponding attributes are left unset here.  `distro_type`
    records the kind of distribution (``EGG_DIST`` by default) and is
    used by ``_sort_dists`` as a secondary sort key.
    """
    if name:
        self.name = safe_name(name)
    if version:
        self._version = safe_version(version)
    self.py_version = py_version
    self.platform = platform
    self.path = path_str
    self.distro_type = distro_type
    self.metadata = metadata
def installed_on(self,path=None):
......@@ -1187,6 +1185,8 @@ class Distribution(object):
)
from_filename = classmethod(from_filename)
# These properties have to be lazy so that we don't have to load any
# metadata until/unless it's actually needed. (i.e., some distributions
# may not know their name or version without loading PKG-INFO)
......@@ -1330,7 +1330,7 @@ def parse_requirements(strs):
def _sort_dists(dists):
tmp = [(dist.version,dist.typecode,dist) for dist in dists]
tmp = [(dist.version,dist.distro_type,dist) for dist in dists]
tmp.sort()
dists[::-1] = [d for v,t,d in tmp]
......@@ -1382,16 +1382,16 @@ class Requirement:
item = item.parsed_version
elif isinstance(item,basestring):
item = parse_version(item)
last = True
last = None
for parsed,trans,op,ver in self.index:
action = trans[cmp(item,parsed)]
if action=='F': return False
elif action=='T': return True
elif action=='+': last = True
elif action=='-': last = False
elif action=='-' or last is None: last = False
if last is None: last = True # no rules encountered
return last
def __hash__(self):
return self.__hash
......@@ -1414,7 +1414,7 @@ state_machine = {
'>' : 'F+F',
'>=': 'T+F',
'==': 'T..',
'!=': 'F..',
'!=': 'F++',
}
......
......@@ -258,12 +258,19 @@ class RequirementsTests(TestCase):
ImmutableSet(["foo","bar"])))
)
def testVersionEquality(self):
    """Check single-condition ``==`` and ``!=`` requirements."""
    exact = Requirement.parse("setuptools==0.3a2")
    excluding = Requirement.parse("setuptools!=0.3a4")
    egg = Distribution.from_filename

    # Versions that must be rejected
    for requirement, filename in (
        (exact, "setuptools-0.3a4.egg"),
        (exact, "setuptools-0.3a1.egg"),
        (excluding, "setuptools-0.3a4.egg"),
    ):
        self.failIf(egg(filename) in requirement)

    # Versions that must be accepted
    for requirement, filename in (
        (exact, "setuptools-0.3a2.egg"),
        (excluding, "setuptools-0.3a2.egg"),
        (excluding, "setuptools-0.3a3.egg"),
        (excluding, "setuptools-0.3a5.egg"),
    ):
        self.failUnless(egg(filename) in requirement)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment