Commit 48bda8b1 authored by PJ Eby

Scrape-proof Sourceforge mirror processing!

--HG--
branch : setuptools
extra : convert_revision : svn%3A6015fed2-1504-0410-9fe1-9d1591cc4771/sandbox/trunk/setuptools%4042088
parent 5cda1e9e
...@@ -134,9 +134,9 @@ class PackageIndex(Environment): ...@@ -134,9 +134,9 @@ class PackageIndex(Environment):
def process_url(self, url, retrieve=False): def process_url(self, url, retrieve=False):
"""Evaluate a URL as a possible download, and maybe retrieve it""" """Evaluate a URL as a possible download, and maybe retrieve it"""
url = fix_sf_url(url)
if url in self.scanned_urls and not retrieve: if url in self.scanned_urls and not retrieve:
return return
self.scanned_urls[url] = True self.scanned_urls[url] = True
if not URL_SCHEME(url): if not URL_SCHEME(url):
# process filenames or directories # process filenames or directories
...@@ -296,6 +296,36 @@ class PackageIndex(Environment): ...@@ -296,6 +296,36 @@ class PackageIndex(Environment):
"; possible download problem?" "; possible download problem?"
) )
def download(self, spec, tmpdir): def download(self, spec, tmpdir):
"""Locate and/or download `spec` to `tmpdir`, returning a local path """Locate and/or download `spec` to `tmpdir`, returning a local path
...@@ -502,8 +532,6 @@ class PackageIndex(Environment): ...@@ -502,8 +532,6 @@ class PackageIndex(Environment):
def _download_html(self, url, headers, filename, tmpdir): def _download_html(self, url, headers, filename, tmpdir):
# Check for a sourceforge URL
sf_url = url.startswith('http://prdownloads.')
file = open(filename) file = open(filename)
for line in file: for line in file:
if line.strip(): if line.strip():
...@@ -513,13 +541,6 @@ class PackageIndex(Environment): ...@@ -513,13 +541,6 @@ class PackageIndex(Environment):
file.close() file.close()
os.unlink(filename) os.unlink(filename)
return self._download_svn(url, filename) return self._download_svn(url, filename)
# Check for a SourceForge header
elif sf_url:
page = ''.join(list(file))
if '?use_mirror=' in page:
file.close()
os.unlink(filename)
return self._download_sourceforge(url, page, tmpdir)
break # not an index page break # not an index page
file.close() file.close()
os.unlink(filename) os.unlink(filename)
...@@ -539,45 +560,44 @@ class PackageIndex(Environment): ...@@ -539,45 +560,44 @@ class PackageIndex(Environment):
def warn(self, msg, *args): def warn(self, msg, *args):
log.warn(msg, *args) log.warn(msg, *args)
def fix_sf_url(url):
    """Rewrite a SourceForge ``prdownloads`` URL to use ``dl.sourceforge.net``.

    If the host part of `url` is ``prdownloads.sourceforge.net`` (compared
    case-insensitively), return a URL with the host replaced by
    ``dl.sourceforge.net``, the path prefixed with ``sourceforge``, and the
    query string dropped.  The scheme, params and fragment are kept as-is.
    Any other URL is returned unchanged.
    """
    scheme, server, path, param, query, frag = urlparse.urlparse(url)
    # Host names are case-insensitive, so normalize before comparing;
    # otherwise e.g. "PRDownloads.SourceForge.net" would slip through.
    if server.lower() != 'prdownloads.sourceforge.net':
        return url
    # The query is deliberately replaced with '' in the rebuilt URL.
    return urlparse.urlunparse(
        (scheme, 'dl.sourceforge.net', 'sourceforge' + path, param, '', frag)
    )
def _download_sourceforge(self, source_url, sf_page, tmpdir):
    """Download package from randomly-selected SourceForge mirror

    `sf_page` is the text of a page containing ``?use_mirror=`` links.  One
    of those links is chosen at random, resolved against `source_url`, and
    fetched with ``self.open_url``.  The fetched page is searched for a
    ``META HTTP-EQUIV="refresh"`` tag, and the URL found there is passed to
    ``self._download_url`` along with `tmpdir`; that call's result is
    returned.

    Raises ``DistutilsError`` if `sf_page` contains no mirror links, or if
    the fetched page has no refresh tag.
    """
    self.debug("Processing SourceForge mirror page")

    mirror_regex = re.compile(r'HREF="?(/.*?\?use_mirror=[^">]*)', re.I)
    urls = [m.group(1) for m in mirror_regex.finditer(sf_page)]
    if not urls:
        raise DistutilsError(
            "URL looks like a Sourceforge mirror page, but no URLs found"
        )

    import random
    # Mirror links are site-relative (they start with "/"), so resolve the
    # chosen one against the page's own URL.
    url = urlparse.urljoin(source_url, random.choice(urls))

    self.info(
        "Requesting redirect to (randomly selected) %r mirror",
        url.split('=',1)[-1]
    )

    f = self.open_url(url)
    try:
        # Close the response even if reading it fails.
        redirect_page = f.read()
    finally:
        f.close()

    match = re.search(
        r'(?i)<META HTTP-EQUIV="refresh" content=".*?URL=(.*?)"',
        redirect_page
    )
    if not match:
        raise DistutilsError(
            'No META HTTP-EQUIV="refresh" found in Sourceforge page at %s'
            % url
        )

    download_url = match.group(1)
    # NOTE(review): URL_SCHEME is defined elsewhere; this assumes it returns
    # a match object for `download_url` (i.e. the refresh URL is absolute).
    scheme = URL_SCHEME(download_url)
    return self._download_url(scheme.group(1), download_url, tmpdir)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment