Commit 19c46a4c authored by Giampaolo Rodola's avatar Giampaolo Rodola Committed by GitHub

bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

parent cd449806
......@@ -277,6 +277,14 @@ Optimizations
See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
(Contributed by Giampaolo Rodola' in :issue:`25427`.)
* :func:`shutil.copytree` uses :func:`os.scandir` function and all copy
functions depending from it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
* The default protocol in the :mod:`pickle` module is now Protocol 4,
first introduced in Python 3.4. It offers better performance and smaller
size compared to Protocol 3 available since Python 3.0.
......
......@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
def _samefile(src, dst):
# Macintosh, Unix.
if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
try:
return os.path.samestat(src.stat(), os.stat(dst))
except OSError:
return False
if hasattr(os.path, 'samefile'):
try:
return os.path.samefile(src, dst)
......@@ -210,6 +216,12 @@ def _samefile(src, dst):
return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst)))
def _stat(fn):
return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
def _islink(fn):
return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
def copyfile(src, dst, *, follow_symlinks=True):
"""Copy data from src to dst in the most efficient way possible.
......@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
file_size = 0
for i, fn in enumerate([src, dst]):
try:
st = os.stat(fn)
st = _stat(fn)
except OSError:
# File most likely does not exist
pass
else:
# XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode):
fn = fn.path if isinstance(fn, os.DirEntry) else fn
raise SpecialFileError("`%s` is a named pipe" % fn)
if _WINDOWS and i == 0:
file_size = st.st_size
if not follow_symlinks and os.path.islink(src):
if not follow_symlinks and _islink(src):
os.symlink(os.readlink(src), dst)
else:
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
......@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
(e.g. Linux) this method does nothing.
"""
if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
if not follow_symlinks and _islink(src) and os.path.islink(dst):
if hasattr(os, 'lchmod'):
stat_func, chmod_func = os.lstat, os.lchmod
else:
return
elif hasattr(os, 'chmod'):
stat_func, chmod_func = os.stat, os.chmod
stat_func, chmod_func = _stat, os.chmod
else:
return
......@@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True):
pass
# follow symlinks (aka don't not follow symlinks)
follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
if follow:
# use the real function if it exists
def lookup(name):
......@@ -339,7 +352,10 @@ def copystat(src, dst, *, follow_symlinks=True):
return fn
return _nop
st = lookup("stat")(src, follow_symlinks=follow)
if isinstance(src, os.DirEntry):
st = src.stat(follow_symlinks=follow)
else:
st = lookup("stat")(src, follow_symlinks=follow)
mode = stat.S_IMODE(st.st_mode)
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
follow_symlinks=follow)
......@@ -415,79 +431,47 @@ def ignore_patterns(*patterns):
return set(ignored_names)
return _ignore_patterns
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=False):
"""Recursively copy a directory tree.
The destination directory must not already exist.
If exception(s) occur, an Error is raised with a list of reasons.
If the optional symlinks flag is true, symbolic links in the
source tree result in symbolic links in the destination tree; if
it is false, the contents of the files pointed to by symbolic
links are copied. If the file pointed by the symlink doesn't
exist, an exception will be added in the list of errors raised in
an Error exception at the end of the copy process.
You can set the optional ignore_dangling_symlinks flag to true if you
want to silence this exception. Notice that this has no effect on
platforms that don't support os.symlink.
The optional ignore argument is a callable. If given, it
is called with the `src` parameter, which is the directory
being visited by copytree(), and `names` which is the list of
`src` contents, as returned by os.listdir():
callable(src, names) -> ignored_names
Since copytree() is called recursively, the callable will be
called once for each directory that is copied. It returns a
list of names relative to the `src` directory that should
not be copied.
The optional copy_function argument is a callable that will be used
to copy each file. It will be called with the source path and the
destination path as arguments. By default, copy2() is used, but any
function that supports the same signature (like copy()) can be used.
"""
names = os.listdir(src)
def _copytree(entries, src, dst, symlinks, ignore, copy_function,
ignore_dangling_symlinks):
if ignore is not None:
ignored_names = ignore(src, names)
ignored_names = ignore(src, set(os.listdir(src)))
else:
ignored_names = set()
os.makedirs(dst)
errors = []
for name in names:
if name in ignored_names:
use_srcentry = copy_function is copy2 or copy_function is copy
for srcentry in entries:
if srcentry.name in ignored_names:
continue
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
srcname = os.path.join(src, srcentry.name)
dstname = os.path.join(dst, srcentry.name)
srcobj = srcentry if use_srcentry else srcname
try:
if os.path.islink(srcname):
if srcentry.is_symlink():
linkto = os.readlink(srcname)
if symlinks:
# We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree
# doing the right thing.
os.symlink(linkto, dstname)
copystat(srcname, dstname, follow_symlinks=not symlinks)
copystat(srcobj, dstname, follow_symlinks=not symlinks)
else:
# ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue
# otherwise let the copy occurs. copy2 will raise an error
if os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore,
if srcentry.is_dir():
copytree(srcobj, dstname, symlinks, ignore,
copy_function)
else:
copy_function(srcname, dstname)
elif os.path.isdir(srcname):
copytree(srcname, dstname, symlinks, ignore, copy_function)
copy_function(srcobj, dstname)
elif srcentry.is_dir():
copytree(srcobj, dstname, symlinks, ignore, copy_function)
else:
# Will raise a SpecialFileError for unsupported file types
copy_function(srcname, dstname)
copy_function(srcentry, dstname)
# catch the Error from the recursive copytree so that we can
# continue with other files
except Error as err:
......@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
raise Error(errors)
return dst
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
ignore_dangling_symlinks=False):
"""Recursively copy a directory tree.
The destination directory must not already exist.
If exception(s) occur, an Error is raised with a list of reasons.
If the optional symlinks flag is true, symbolic links in the
source tree result in symbolic links in the destination tree; if
it is false, the contents of the files pointed to by symbolic
links are copied. If the file pointed by the symlink doesn't
exist, an exception will be added in the list of errors raised in
an Error exception at the end of the copy process.
You can set the optional ignore_dangling_symlinks flag to true if you
want to silence this exception. Notice that this has no effect on
platforms that don't support os.symlink.
The optional ignore argument is a callable. If given, it
is called with the `src` parameter, which is the directory
being visited by copytree(), and `names` which is the list of
`src` contents, as returned by os.listdir():
callable(src, names) -> ignored_names
Since copytree() is called recursively, the callable will be
called once for each directory that is copied. It returns a
list of names relative to the `src` directory that should
not be copied.
The optional copy_function argument is a callable that will be used
to copy each file. It will be called with the source path and the
destination path as arguments. By default, copy2() is used, but any
function that supports the same signature (like copy()) can be used.
"""
with os.scandir(src) as entries:
return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
ignore=ignore, copy_function=copy_function,
ignore_dangling_symlinks=ignore_dangling_symlinks)
# version vulnerable to race conditions
def _rmtree_unsafe(path, onerror):
try:
......
:func:`shutil.copytree` uses :func:`os.scandir` function and all copy
functions depending from it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and + 30% on a Windows SMB share. Also the number of :func:`os.stat`
syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
on network filesystems.
(Contributed by Giampaolo Rodola' in :issue:`33695`.)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment