"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""

import socket
26
import os
27
import time
28
import sys
29
from urlparse import urljoin as basejoin
30

31 32
# Public names exported by "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "getproxies"]

__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
42

43 44
# Helper for non-unix systems
# Pick the platform-specific path <-> URL converters; every other platform
# falls back to plain quoting/unquoting.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
Guido van Rossum's avatar
Guido van Rossum committed
60

61 62 63 64 65 66 67 68 69
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
70
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When *proxies* is given, a fresh FancyURLopener using that mapping is
    created for this call only.  Otherwise the module-level opener is
    used, and created on first use.  *data* is passed on to the opener's
    open() only when it is not None.
    """
    global _urlopener
    if proxies is not None:
        # A one-off opener: do not disturb the shared one.
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
84

85
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the shared module-level opener.

    The shared FancyURLopener is created on first use.  All arguments are
    forwarded to its retrieve() method and its result is returned.
    """
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
90

91
def urlcleanup():
    """Call cleanup() on the shared module-level opener, if one exists."""
    if _urlopener:
        _urlopener.cleanup()
94

95 96 97 98 99
# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """IOError subclass that also carries a ``content`` attribute.

    *message* becomes the exception's only argument; *content* is stored
    unchanged on the instance.
    """

    def __init__(self, message, content):
        super(ContentTooShortError, self).__init__(message)
        self.content = content
100 101 102

ftpcache = {}
class URLopener:
103 104 105 106 107 108
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""
109

110 111
    __tempfiles = None

112 113
    version = "Python-urllib/%s" % __version__

114
    # Constructor
115
    def __init__(self, proxies=None, **x509):
        """Set up proxies, default headers and the caches.

        proxies -- mapping of URL scheme to proxy URL; when None the
                   result of getproxies() is used.
        x509    -- optional 'key_file' and 'cert_file' keyword arguments,
                   stored for later use by open_https().
        """
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        """Call close() when the opener is finalized."""
        self.close()

    def close(self):
        """Close the opener; this simply delegates to cleanup()."""
        self.cleanup()

    def cleanup(self):
        """Delete the temporary files recorded by retrieve() and empty
        the tempcache.  Files that cannot be removed are skipped."""
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for path in self.__tempfiles:
                try:
                    self.__unlink(path)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')

        The positional arguments are appended to self.addheaders as a
        single tuple."""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
165
        """Use URLopener().open(file) instead of open(file, 'r')."""
166
        fullurl = unwrap(toBytes(fullurl))
167
        if self.tempcache and fullurl in self.tempcache:
168 169 170
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
171 172 173
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
174
        if urltype in self.proxies:
175 176
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
177
            host, selector = splithost(proxyhost)
178
            url = (host, fullurl) # Signal special case to open_*()
179 180
        else:
            proxy = None
181 182
        name = 'open_' + urltype
        self.type = urltype
183
        name = name.replace('-', '_')
184
        if not hasattr(self, name):
185 186
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
187 188 189 190 191 192 193
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
194
        except socket.error as msg:
195 196 197
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type.

        The default implementation always raises
        IOError('url error', 'unknown url type', <scheme>).
        """
        urltype, url = splittype(fullurl)
        raise IOError('url error', 'unknown url type', urltype)

202 203 204 205 206
    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface for a proxy whose scheme has no handler.

        The default implementation always raises
        IOError('url error', 'invalid proxy for <scheme>', proxy).
        """
        scheme = splittype(fullurl)[0]
        raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)

207
    # External interface
208
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object.

        filename   -- destination path; when omitted, a temporary file is
                      created (and remembered for cleanup()).
        reporthook -- optional callable invoked as
                      reporthook(blocknum, blocksize, totalsize) once
                      before the transfer and after every block;
                      totalsize is -1 when no Content-Length was sent.
        data       -- passed through to open().

        Raises ContentTooShortError when a Content-Length header was sent
        and fewer bytes than that were received.
        """
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        urltype, url1 = splittype(url)
        if filename is None and (not urltype or urltype == 'file'):
            # A local file needs no copy; fall through to a normal
            # download if it cannot be opened locally.
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                # Parse Content-Length regardless of reporthook so the
                # truncation check below always has a size to compare.
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol.

        *url* is either the part of the URL after 'http:' (a string) or,
        when going through a proxy, a (proxyhost, fullurl) tuple as built
        by open().  A request with *data* is sent as a POST with an
        urlencoded-form content type, otherwise as a GET.  A 200 response
        is returned wrapped in addinfourl; any other status goes to
        http_error().
        """
        import httplib
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the credentials.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host:
            raise IOError('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        http_conn = httplib.HTTPConnection(host)
        # XXX We should fix urllib so that it works with HTTP/1.1.
        http_conn._http_vsn = 10
        http_conn._http_vsn_str = "HTTP/1.0"

        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost
        # Headers added through addheader() override the ones set above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except httplib.BadStatusLine:
            # something went wrong with the HTTP status line
            http_conn.close()
            raise IOError('http protocol error', 0,
                          'got a bad status line', None)

        if response.status == 200:
            return addinfourl(response.fp, response.msg, "http:" + url)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)
349 350

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code.

        A specific handler that returns a true value ends the handling;
        otherwise http_error_default() is called.  *data* is only passed
        to the specific handler when it is not None."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)
365 366

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError.

        Always raises IOError('http error', errcode, errmsg, headers)."""
        fp.read()   # read the rest of the response; the data is discarded
        fp.close()
        raise IOError('http error', errcode, errmsg, headers)

372
    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol.

            Only defined when the socket module has an 'ssl' attribute.
            *url* is either the part of the URL after 'https:' (a string)
            or, when going through a proxy, a (proxyhost, fullurl) tuple
            as built by open().  The key_file and cert_file given to the
            constructor are handed to httplib.HTTPS.
            """
            import httplib
            user_passwd = None
            proxy_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                # here, we determine, whether the proxy contains authorization information
                proxy_passwd, host = splituser(host)
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        # Rebuild the selector without the credentials.
                        selector = "%s://%s%s" % (urltype, realhost, rest)
            if not host:
                raise IOError('https error', 'no host given')
            if proxy_passwd:
                import base64
                proxy_auth = base64.b64encode(proxy_passwd).strip()
            else:
                proxy_auth = None
            if user_passwd:
                import base64
                auth = base64.b64encode(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-Type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-Length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if proxy_auth:
                h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
            if auth:
                h.putheader('Authorization', 'Basic %s' % auth)
            if realhost:
                h.putheader('Host', realhost)
            for args in self.addheaders:
                h.putheader(*args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == -1:
                if fp:
                    fp.close()
                # something went wrong with the HTTP status line
                raise IOError('http protocol error', 0,
                              'got a bad status line', None)
            if errcode == 200:
                return addinfourl(fp, headers, "https:" + url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)
443

444
    def open_file(self, url):
        """Use local file or FTP depending on form of URL.

        A URL of the form //host/... whose host part is non-empty and is
        not 'localhost' is handed to open_ftp(); everything else goes to
        open_local_file().  A non-string *url* (the proxy case) raises
        IOError.
        """
        if not isinstance(url, str):
            raise IOError('file error', 'proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file.

        Builds Content-Type, Content-Length and Last-modified headers
        from the file's stat data and returns the opened file wrapped in
        addinfourl.  Raises IOError when the file cannot be stat'ed, or
        when the URL names a host (or carries a port) that does not
        resolve to this machine.
        """
        import mimetypes, mimetools, email.utils
        from io import StringIO
        host, path = splithost(url)
        localname = url2pathname(path)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
            # Only a port-less host that resolves to this machine counts
            # as local.
            if port or socket.gethostbyname(host) not in (localhost(),
                                                          thishost()):
                raise IOError('local file error', 'not on local host')
        urlfile = path
        if path[:1] == '/':
            urlfile = 'file://' + path
        return addinfourl(open(localname, 'rb'), headers, urlfile)

    def open_ftp(self, url):
486
        """Use FTP protocol."""
487 488
        if not isinstance(url, str):
            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
489
        import mimetypes, mimetools
490
        from io import StringIO
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
508
        dirs = path.split('/')
509 510
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
511
        if dirs and not dirs[0]: dirs[0] = '/'
512
        key = user, host, port, '/'.join(dirs)
513 514 515 516 517 518 519 520 521
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
522
            if not key in self.ftpcache:
523 524 525 526 527 528
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
529
                if attr.lower() == 'type' and \
530
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
531
                    type = value.upper()
532
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
533 534 535 536
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
537
            if retrlen is not None and retrlen >= 0:
538
                headers += "Content-Length: %d\n" % retrlen
539
            headers = mimetools.Message(StringIO(headers))
540
            return addinfourl(fp, headers, "ftp:" + url)
541
        except ftperrors() as msg:
542 543 544
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL.

        The *data* argument (POSTed data) is ignored.  Raises IOError
        when *url* is not a string (the proxy case) or contains no comma.
        """
        if not isinstance(url, str):
            raise IOError('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import mimetools
        from io import StringIO
        try:
            [mediatype, data] = url.split(',', 1)
        except ValueError:
            raise IOError('data error', 'bad data URL')
        if not mediatype:
            mediatype = 'text/plain;charset=US-ASCII'
        # A trailing ";token" without '=' is the encoding, not a parameter.
        semi = mediatype.rfind(';')
        if semi >= 0 and '=' not in mediatype[semi:]:
            encoding = mediatype[semi+1:]
            mediatype = mediatype[:semi]
        else:
            encoding = ''
        msg = []
        # %H:%M:%S instead of %T: %T is not available on every platform.
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % mediatype)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO(msg)
        headers = mimetools.Message(f, 0)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
586

587

588
class FancyURLopener(URLopener):
589
    """Derived class with handlers for errors we can handle (perhaps)."""
590

591
    def __init__(self, *args, **kwargs):
        """Initialise the base URLopener plus the auth and redirect state."""
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}    # "realm@host" -> (user, passwd)
        self.tries = 0          # redirects followed in the current chain
        self.maxtries = 10      # redirect limit checked by http_error_302()
596 597

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception.

        The error response is returned wrapped in addinfourl instead."""
        return addinfourl(fp, headers, "http:" + url)

601
    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily).

        Follows the redirect through redirect_internal().  Once
        self.tries reaches self.maxtries, the chain is reported as a 500
        "Redirect Recursion" error through http_error_500 if that
        attribute exists, else through http_error_default.
        """
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        """Open the URL named by the 'location' (or 'uri') header.

        Returns None when neither header is present.  The new URL is
        opened without *data*.  Raises IOError when the redirect target
        is not an http, https or ftp URL.
        """
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.read()   # read the rest of the response; the data is discarded
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        # The target comes from the server and is untrusted: do not let it
        # steer us to other schemes such as file:.
        if not newurl.lower().startswith(('http://', 'https://', 'ftp://')):
            raise IOError('redirect error', errcode,
                          errmsg + " - Redirection to url '%s' is not allowed"
                          % newurl,
                          headers)
        return self.open(newurl)
629

630
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently).

        Handled exactly like a 302."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
633

634 635 636 637
    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302).

        Delegates to http_error_302() with the same arguments."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

638 639 640 641 642 643 644
    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error.

        A request that carried data is not redirected; it goes to
        http_error_default() instead."""
        if data is not None:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

645
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only.

        A missing or unparsable 'www-authenticate' header, or a scheme
        other than Basic, is passed to URLopener.http_error_default().
        Otherwise the request is retried through the method named
        retry_<type>_basic_auth.
        """
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)
Tim Peters's avatar
Tim Peters committed
666

667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687
    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only.

        A missing or unparsable 'proxy-authenticate' header, or a scheme
        other than Basic, is passed to URLopener.http_error_default().
        Otherwise the request is retried through the method named
        retry_proxy_<type>_basic_auth.
        """
        import re
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        challenge = headers['proxy-authenticate']
        parsed = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', challenge)
        if not parsed:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = parsed.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        retry = getattr(self, 'retry_proxy_' + self.type + '_basic_auth')
        if data is None:
            return retry(url, realm)
        return retry(url, realm, data)
Tim Peters's avatar
Tim Peters committed
688

689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        """Retry an http request after adding credentials to the proxy URL.

        Rewrites self.proxies['http'] to include the user and password
        obtained from get_user_passwd().  Returns None when neither was
        supplied.
        """
        target_host, target_path = splithost(url)
        newurl = 'http://' + target_host + target_path
        scheme, proxy_rest = splittype(self.proxies['http'])
        proxy_netloc, proxy_path = splithost(proxy_rest)
        # Index just past any "user:pass@" prefix; 0 when there is none.
        # A non-zero value also makes get_user_passwd() drop its cached
        # entry.
        cred_end = proxy_netloc.find('@') + 1
        proxy_netloc = proxy_netloc[cred_end:]
        user, passwd = self.get_user_passwd(proxy_netloc, realm, cred_end)
        if not user and not passwd:
            return None
        credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
        self.proxies['http'] = ('http://' + credentials + '@' +
                                proxy_netloc + proxy_path)
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)
705

706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721
    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        """Retry an https request after adding credentials to the proxy URL.

        Rewrites self.proxies['https'] to include the user and password
        obtained from get_user_passwd().  Returns None when neither was
        supplied.
        """
        target_host, target_path = splithost(url)
        newurl = 'https://' + target_host + target_path
        scheme, proxy_rest = splittype(self.proxies['https'])
        proxy_netloc, proxy_path = splithost(proxy_rest)
        # Index just past any "user:pass@" prefix; 0 when there is none.
        # A non-zero value also makes get_user_passwd() drop its cached
        # entry.
        cred_end = proxy_netloc.find('@') + 1
        proxy_netloc = proxy_netloc[cred_end:]
        user, passwd = self.get_user_passwd(proxy_netloc, realm, cred_end)
        if not user and not passwd:
            return None
        credentials = quote(user, safe='') + ':' + quote(passwd, safe='')
        self.proxies['https'] = ('https://' + credentials + '@' +
                                 proxy_netloc + proxy_path)
        if data is None:
            return self.open(newurl)
        return self.open(newurl, data)
Tim Peters's avatar
Tim Peters committed
722

Guido van Rossum's avatar
Guido van Rossum committed
723
    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry an http request with user:password@ added to the host.

        Returns None when get_user_passwd() supplies neither a user nor
        a password.
        """
        host, selector = splithost(url)
        # Index just past any "user:pass@" prefix; 0 when there is none.
        # A non-zero value also makes get_user_passwd() drop its cached
        # entry.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
735

Guido van Rossum's avatar
Guido van Rossum committed
736
    def retry_https_basic_auth(self, url, realm, data=None):
        """Retry an https request with user:password@ added to the host.

        Returns None when get_user_passwd() supplies neither a user nor
        a password.
        """
        host, selector = splithost(url)
        # Index just past any "user:pass@" prefix; 0 when there is none.
        # A non-zero value also makes get_user_passwd() drop its cached
        # entry.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd):
            return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
748 749

    def get_user_passwd(self, host, realm, clear_cache = 0):
        """Return (user, passwd) for *realm* at *host*.

        Answers are cached in self.auth_cache under "realm@host" (host
        lower-cased).  A true *clear_cache* discards a cached entry and
        asks prompt_user_passwd() again.  A result is only cached when
        the user or the password is non-empty.
        """
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd:
            self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!

        Asks for the user name with input() and for the password with
        getpass.getpass().  Returns (None, None) if the prompt is
        interrupted with KeyboardInterrupt."""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None
771 772


773 774 775 776
# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'.

    The lookup is done once and the result cached at module level."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost
782 783 784

_thishost = None
def thishost():
    """Return the IP address of the current host.

    The address of socket.gethostname() is resolved on the first call
    only and kept in the module-level _thishost.
    """
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
790 791 792

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class.

    ftplib is imported on the first call only; the value of
    ftplib.all_errors is kept in the module-level _ftperrors.
    """
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors
799 800 801

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object.

    The message is built once from an empty StringIO and kept in the
    module-level _noheaders; every call returns that same object.
    """
    global _noheaders
    if _noheaders is None:
        import mimetools
        from io import StringIO
        _noheaders = mimetools.Message(StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders
810 811 812 813 814


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None):
        """Store the connection parameters and connect right away."""
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        """(Re)connect, log in and cwd into each entry of self.dirs."""
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        """Start a transfer and return (file object, retrieval length).

        type 'd' or 'D' asks for a directory listing; any other value
        is sent as 'TYPE <type>' and file is fetched with RETR.  If the
        TYPE command fails the connection is re-established once.  A
        RETR refused with a 550 reply falls back to an ASCII LIST; any
        other permanent error is re-raised as IOError('ftp error',
        reason).  The returned file object calls endtransfer() when it
        is closed.
        """
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have gone stale: reconnect, retry.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                if str(reason)[:3] != '550':
                    raise IOError('ftp error', reason).with_traceback(
                        sys.exc_info()[2])
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file:
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        """Finish a pending transfer; do nothing when none is active."""
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        """Finish any pending transfer and close the FTP connection."""
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
880 881

class addbase:
    """Base class for addinfo and addclosehook."""

    # XXX Add a method to expose the timeout on the underlying socket?

    def __init__(self, fp):
        """Wrap fp, exposing its read methods directly on this object."""
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"):
            self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"):
            self.fileno = self.fp.fileno
        else:
            self.fileno = lambda: None
        # Kept for callers that fetch these attributes explicitly.
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "__next__"):
                self.__next__ = self.fp.__next__

    def __iter__(self):
        """Iterate over the wrapped file.

        Implicit iteration (iter(obj), for-loops) looks __iter__ up on
        the class, not the instance, so the instance attribute assigned
        in __init__ is not enough on its own.
        """
        return iter(self.fp)

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        """Close the wrapped file and disable the forwarded methods."""
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp:
            self.fp.close()
        self.fp = None
911 912

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        """Wrap fp; closehook(*hookargs) is called when close() runs."""
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        """Close the file, then call the hook once and forget it."""
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
926 927

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        """Wrap fp and remember headers for info()."""
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers
936

937
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        """Wrap fp and remember headers and url for the accessors."""
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        """Return the headers object given to the constructor."""
        return self.headers

    def geturl(self):
        """Return the url given to the constructor."""
        return self.url
950

951

952
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'

967
def _is_unicode(x):
    """Return true if x is a text (str) object.

    The str builtin always exists, so no NameError fallback is needed.
    """
    return isinstance(x, str)
975

976 977 978 979
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A text URL is encoded as ASCII; anything else is returned
    unchanged.  Raises UnicodeError when the text cannot be encoded.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

988
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, one pair of enclosing angle
    brackets and a leading 'URL:' marker, in that order.
    """
    url = str(url).strip()
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
995

996
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased; when url has no 'type:' prefix
    the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match is None:
        return None, url
    scheme = match.group(1)
    return scheme.lower(), url[len(scheme) + 1:]
1009

1010
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match is None:
        return None, url
    return match.group(1, 2)
1021

1022
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    Both parts are passed through unquote().  The split happens at the
    last '@'; without one the result is (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match is None:
        return None, host
    # Return a real pair: map() would hand back a one-shot iterator.
    userinfo, hostport = match.group(1, 2)
    return unquote(userinfo), unquote(hostport)
1033

1034
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'; without one the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match is None:
        return user, None
    return match.group(1, 2)
1045

1046
# splitport('host:port') --> 'host', 'port'
1047
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When host does not
    end in ':<digits>' the result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match is None:
        return host, None
    return match.group(1, 2)
1058

1059
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match is None:
        return host, defport
    host, port = match.group(1, 2)
    try:
        if not port:
            raise ValueError("no digits")
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
1080

1081
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'; without one the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and kept for later calls.  Raw string so
        # that the backslash reaches the regex engine untouched.
        import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
1092

1093
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'; without one the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match is None:
        return url, None
    return match.group(1, 2)
1104

1105
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    path, *attrs = url.split(';')
    return path, attrs
1110

1111
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='; without one the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and kept for later calls.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match is None:
        return attr, None
    return match.group(1, 2)
1122

1123 1124 1125
# Every two-digit hex string, in any mix of upper and lower case, mapped
# to the character it encodes ('%aB' must decode just like '%AB').
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    A '%' that is not followed by two hex digits is left as it is.
    """
    pieces = s.split('%')
    res = [pieces[0]]
    for item in pieces[1:]:
        char = _hextochr.get(item[:2])
        if char is None:
            # Not a valid escape: put the '%' back untouched.
            res.append('%' + item)
        else:
            res.append(char + item[2:])
    return "".join(res)
1138

1139
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
    # '+' must become a space before decoding, so that an escaped
    # '%2B' still comes out as a literal '+'.
    return unquote(s.replace('+', ' '))
1143

1144
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
_safe_quoters= {}

class Quoter:
    """Callable that quotes one character at a time, with a cache.

    Characters in safe or always_safe are returned unchanged.  Other
    characters below code point 256 become a single '%XX' escape; the
    rest are escaped byte by byte from their UTF-8 encoding.
    """

    def __init__(self, safe):
        self.cache = {}
        self.safe = safe + always_safe

    def __call__(self, c):
        try:
            return self.cache[c]
        except KeyError:
            pass
        if ord(c) >= 256:
            # Not cached, exactly as before.
            return "".join(['%%%02X' % i for i in c.encode("utf-8")])
        res = c if c in self.safe else '%%%02X' % ord(c)
        self.cache[c] = res
        return res
1164

1165
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    # One Quoter (with its per-character cache) is kept per safe set.
    cachekey = (safe, always_safe)
    try:
        quoter = _safe_quoters[cachekey]
    except KeyError:
        quoter = _safe_quoters[cachekey] = Quoter(safe)
    return ''.join(map(quoter, s))
1194

1195 1196
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces unescaped while quoting so they can be turned into
    # '+' afterwards; a literal '+' in s is escaped by quote().
    return quote(s, safe + ' ').replace(' ', '+')
1201

1202 1203
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError when query is neither a mapping nor a sequence
    whose first element is a tuple.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    pairs = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            pairs.append(quote_plus(str(k)) + '=' + quote_plus(str(v)))
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                pairs.append(k + '=' + quote_plus(v))
                continue
            try:
                # is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_plus(str(v)))
            else:
                # loop over the sequence
                for elt in v:
                    pairs.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(pairs)
1264

1265
# Proxy handling
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    Variable names are compared case-insensitively and variables with
    an empty value are ignored.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name.endswith('_proxy'):
            proxies[name[:-6]] = value
    return proxies

1282 1283
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        Returns an empty dictionary when the ic module cannot be
        imported or ic.IC() fails.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies
1312

1313 1314 1315
    def proxy_bypass(x):
        """Never bypass the proxy: the answer is 0 for every host."""
        bypass = 0
        return bypass

1316 1317
    def getproxies():
        """Return the environment's proxies, else those of Internet Config."""
        from_env = getproxies_environment()
        if from_env:
            return from_env
        return getproxies_internetconfig()
1318

1319 1320
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        Returns an empty dictionary when _winreg cannot be imported,
        when the registry cannot be read, or when ProxyEnable is false.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        import re
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                if proxyEnable:
                    # Returned as Unicode but problems if not converted to ASCII
                    proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                           'ProxyServer')[0])
                    if '=' in proxyServer:
                        # Per-protocol settings
                        for p in proxyServer.split(';'):
                            protocol, address = p.split('=', 1)
                            # See if address has a type:// prefix
                            if not re.match('^([^/:]+)://', address):
                                address = '%s://%s' % (protocol, address)
                            proxies[protocol] = address
                    else:
                        # Use one setting for all protocols
                        if proxyServer[:5] == 'http:':
                            proxies['http'] = proxyServer
                        else:
                            proxies['http'] = 'http://%s' % proxyServer
                            proxies['ftp'] = 'ftp://%s' % proxyServer
            finally:
                # Release the key even when a query above raised.
                internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies
1364

1365 1366 1367 1368 1369 1370 1371 1372
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Settings found in the environment win; the registry is only
        consulted when the environment yields nothing.

        """
        from_env = getproxies_environment()
        if from_env:
            return from_env
        return getproxies_registry()
1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393

    def proxy_bypass(host):
        """Return 1 if host matches the registry's ProxyOverride list, else 0.

        host may carry a ':port' suffix, which is ignored.  The host
        name, its IP address and its fully qualified name are each
        tested against every ';'-separated ProxyOverride entry, where
        '*' and '?' act as glob characters and '<local>' stands for
        this machine's own names and addresses.  Returns 0 when
        _winreg is unavailable, the registry cannot be read, or the
        proxy is disabled.
        """
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            try:
                proxyEnable = _winreg.QueryValueEx(internetSettings,
                                                   'ProxyEnable')[0]
                # Returned as Unicode but problems if not converted to ASCII
                proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                         'ProxyOverride')[0])
            finally:
                # Release the key whether or not the queries succeeded.
                internetSettings.Close()
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        hosts = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                hosts.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                hosts.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        patterns = []
        for entry in proxyOverride.split(';'):
            if entry == '<local>':
                patterns.extend(['localhost',
                                 '127.0.0.1',
                                 socket.gethostname(),
                                 socket.gethostbyname(
                                     socket.gethostname())])
            else:
                patterns.append(entry)
        # now check if we match one of the registry values.
        for test in patterns:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in hosts:
                if re.match(test, val, re.I):
                    return 1
        return 0

1433 1434 1435 1436
else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        """No bypass rules on this platform: the answer is always 0."""
        return 0
1439

1440 1441
# Test and time quote() and unquote()
def test1():
    """Round-trip chr(0)..chr(255), four times over, through quote() and
    unquote(); print the strings and the elapsed time."""
    s = ''.join(map(chr, range(256))) * 4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print('Wrong!')
    print(repr(s))
    print(repr(qs))
    print(repr(uqs))
    print(round(t1 - t0, 3), 'sec')
1455 1456


Guido van Rossum's avatar
Guido van Rossum committed
1457 1458
def reporthook(blocknum, blocksize, totalsize):
    """Report during remote transfers: print the three progress values."""
    print("Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize))
Guido van Rossum's avatar
Guido van Rossum committed
1461

1462
# Test program
1463
def test(args=None):
    """Test program: retrieve each URL in args and print what came back.

    Without args a built-in list of file, ftp and http URLs is used
    (plus an https one when URLopener has open_https).  For every URL
    the local file name, the headers and the data with carriage
    returns removed are printed.  urlcleanup() is always called at the
    end.
    """
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print('-'*10, url, '-'*10)
            fn, h = urlretrieve(url, None, reporthook)
            print(fn)
            if h:
                print('======')
                for k in h.keys(): print(k + ':', h[k])
                print('======')
            with open(fn, 'rb') as fp:
                data = fp.read()
            # The file is read in binary mode, so data is bytes and the
            # carriage returns must be removed with a bytes pattern.
            if b'\r' in data:
                data = data.replace(b'\r', b'')
            print(data)
            fn, h = None, None
        print('-'*40)
    finally:
        urlcleanup()
1495

1496
def main():
    """Command-line entry point.

    -h prints usage and returns.  -t runs test() on the remaining
    arguments; given twice or more it runs test1() first.  Without -t
    the contents of each URL argument are printed.
    """
    import getopt
    import sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error as msg:
        print(msg)
        print("Use -h for help")
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print("Usage: python urllib.py [-t] [url ...]")
            print("-t runs self-test;", end=' ')
            print("otherwise, contents of urls are printed")
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print("Use -h for help")
        for url in args:
            print(urlopen(url).read(), end=' ')
1522

1523 1524
# Run test program when run as a script
if __name__ == '__main__':
    main()