Patch [ 1062060 ] fix for 1016880 urllib.urlretrieve silently truncates dwnld

b925602f · Georg Brandl · 56897318 · b925602f · b925602f · b925602f
Commit b925602f authored Aug 24, 2005 by Georg Brandl
Hide whitespace changes
Inline Side-by-side

Showing with 48 additions and 2 deletions

Doc/lib/liburllib.tex Doc/lib/liburllib.tex +29 -1

Lib/urllib.py Lib/urllib.py +15 -1

Misc/NEWS Misc/NEWS +4 -0

No files found.
--- a/Doc/lib/liburllib.tex
+++ b/Doc/lib/liburllib.tex
@@ -142,6 +142,25 @@ If the \var{url} uses the \file{http:} scheme identifier, the optional
 (normally the request type is \code{GET}).  The \var{data} argument
 must in standard \mimetype{application/x-www-form-urlencoded} format;
 see the \function{urlencode()} function below.
+
+\versionchanged[
+\function{urlretrieve()} will raise \exception{ContentTooShortError}
+when it detects that the amount of data available 
+was less than the expected amount (which is the size reported by a 
+\var{Content-Length} header). This can occur, for example, when the 
+download is interrupted.
+
+The \var{Content-Length} is treated as a lower bound: if there's more data 
+to read, urlretrieve reads more data, but if less data is available, 
+it raises the exception.
+
+You can still retrieve the downloaded data in this case; it is stored
+in the \member{content} attribute of the exception instance.
+
+If no \var{Content-Length} header was supplied, \function{urlretrieve()}
+cannot check the size of the data it has downloaded, and just returns it.
+In this case you just have to assume that the download was successful]{2.5}
+
 \end{funcdesc}

 \begin{datadesc}{_urlopener}
@@ -283,6 +302,15 @@ subclass may override this method to support more appropriate behavior
 if needed.}
 \end{classdesc}

+\begin{excclassdesc}{ContentTooShortError}{msg, content}
+This exception is raised when the \function{urlretrieve()} function
+detects that the amount of the downloaded data is less than the 
+expected amount (given by the \var{Content-Length} header). The
+\member{content} attribute stores the downloaded (and supposedly
+truncated) data.
+\versionadded{2.5}
+\end{excclassdesc}
+
 Restrictions:

 \begin{itemize}
@@ -317,7 +345,7 @@ Web client using these functions without using threads.
 \item
 The data returned by \function{urlopen()} or \function{urlretrieve()}
 is the raw data returned by the server.  This may be binary data
-(e.g. an image), plain text or (for example) HTML\index{HTML}.  The
+(such as an image), plain text or (for example) HTML\index{HTML}.  The
 HTTP\indexii{HTTP}{protocol} protocol provides type information in the
 reply header, which can be inspected by looking at the
 \mailheader{Content-Type} header.  For the

--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -86,6 +86,11 @@ def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()

+# exception raised when downloaded size is less than content-length
+class ContentTooShortError(IOError):
+    def __init__(self, message, content):
+        IOError.__init__(self, message)
+        self.content = content

 ftpcache = {}
 class URLopener:
@@ -228,24 +233,33 @@ class URLopener:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
+        read = 0
        blocknum = 1
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(0, bs, size)
        block = fp.read(bs)
+        read += len(block)
        if reporthook:
            reporthook(1, bs, size)
        while block:
            tfp.write(block)
            block = fp.read(bs)
-            blocknum = blocknum + 1
+            read += len(block)
+            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp
+
+        # raise exception if fewer bytes were read than content-length promised
+        if size >= 0 and read < size:
+            raise ContentTooShortError("retrieval incomplete: got only %i out "
+                                       "of %i bytes" % (read, size), result)
+
        return result

    # Each method named open_<type> knows how to open that type of URL

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -193,6 +193,10 @@ Extension Modules
 Library
 -------

+- Patch #1062060: urllib.urlretrieve() now raises a new exception, named
+  ContentTooShortError, when the actually downloaded size is less than
+  the size given by the Content-Length header.
+
 - Bug #1121494: distutils.dir_utils.mkpath now accepts Unicode strings.

 - Bug #1178484: Return complete lines from codec stream readers