Commit 19c46a4c authored by Giampaolo Rodola, committed by GitHub

bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

parent cd449806
...@@ -277,6 +277,14 @@ Optimizations ...@@ -277,6 +277,14 @@ Optimizations
See :ref:`shutil-platform-dependent-efficient-copy-operations` section. See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
(Contributed by Giampaolo Rodola' in :issue:`25427`.) (Contributed by Giampaolo Rodola' in :issue:`25427`.)
* :func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
functions that depend on it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
* The default protocol in the :mod:`pickle` module is now Protocol 4, * The default protocol in the :mod:`pickle` module is now Protocol 4,
first introduced in Python 3.4. It offers better performance and smaller first introduced in Python 3.4. It offers better performance and smaller
size compared to Protocol 3 available since Python 3.0. size compared to Protocol 3 available since Python 3.0.
......
...@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE): ...@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
def _samefile(src, dst): def _samefile(src, dst):
# Macintosh, Unix. # Macintosh, Unix.
if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
try:
return os.path.samestat(src.stat(), os.stat(dst))
except OSError:
return False
if hasattr(os.path, 'samefile'): if hasattr(os.path, 'samefile'):
try: try:
return os.path.samefile(src, dst) return os.path.samefile(src, dst)
...@@ -210,6 +216,12 @@ def _samefile(src, dst): ...@@ -210,6 +216,12 @@ def _samefile(src, dst):
return (os.path.normcase(os.path.abspath(src)) == return (os.path.normcase(os.path.abspath(src)) ==
os.path.normcase(os.path.abspath(dst))) os.path.normcase(os.path.abspath(dst)))
def _stat(fn):
    """Return the stat result for *fn*.

    *fn* may be an os.DirEntry or anything os.stat() accepts.  For a
    DirEntry its own stat() method is used, so a value it has already
    cached is reused instead of issuing another os.stat() call.
    """
    return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
def _islink(fn):
    """Return True if *fn* is a symbolic link.

    *fn* may be an os.DirEntry or a path.  A DirEntry is asked through
    its is_symlink() method; any other value goes through
    os.path.islink().
    """
    return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
def copyfile(src, dst, *, follow_symlinks=True): def copyfile(src, dst, *, follow_symlinks=True):
"""Copy data from src to dst in the most efficient way possible. """Copy data from src to dst in the most efficient way possible.
...@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True): ...@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
file_size = 0 file_size = 0
for i, fn in enumerate([src, dst]): for i, fn in enumerate([src, dst]):
try: try:
st = os.stat(fn) st = _stat(fn)
except OSError: except OSError:
# File most likely does not exist # File most likely does not exist
pass pass
else: else:
# XXX What about other special files? (sockets, devices...) # XXX What about other special files? (sockets, devices...)
if stat.S_ISFIFO(st.st_mode): if stat.S_ISFIFO(st.st_mode):
fn = fn.path if isinstance(fn, os.DirEntry) else fn
raise SpecialFileError("`%s` is a named pipe" % fn) raise SpecialFileError("`%s` is a named pipe" % fn)
if _WINDOWS and i == 0: if _WINDOWS and i == 0:
file_size = st.st_size file_size = st.st_size
if not follow_symlinks and os.path.islink(src): if not follow_symlinks and _islink(src):
os.symlink(os.readlink(src), dst) os.symlink(os.readlink(src), dst)
else: else:
with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst: with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
...@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True): ...@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
(e.g. Linux) this method does nothing. (e.g. Linux) this method does nothing.
""" """
if not follow_symlinks and os.path.islink(src) and os.path.islink(dst): if not follow_symlinks and _islink(src) and os.path.islink(dst):
if hasattr(os, 'lchmod'): if hasattr(os, 'lchmod'):
stat_func, chmod_func = os.lstat, os.lchmod stat_func, chmod_func = os.lstat, os.lchmod
else: else:
return return
elif hasattr(os, 'chmod'): elif hasattr(os, 'chmod'):
stat_func, chmod_func = os.stat, os.chmod stat_func, chmod_func = _stat, os.chmod
else: else:
return return
...@@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True): ...@@ -325,7 +338,7 @@ def copystat(src, dst, *, follow_symlinks=True):
pass pass
# follow symlinks (aka don't not follow symlinks) # follow symlinks (aka don't not follow symlinks)
follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst)) follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
if follow: if follow:
# use the real function if it exists # use the real function if it exists
def lookup(name): def lookup(name):
...@@ -339,6 +352,9 @@ def copystat(src, dst, *, follow_symlinks=True): ...@@ -339,6 +352,9 @@ def copystat(src, dst, *, follow_symlinks=True):
return fn return fn
return _nop return _nop
if isinstance(src, os.DirEntry):
st = src.stat(follow_symlinks=follow)
else:
st = lookup("stat")(src, follow_symlinks=follow) st = lookup("stat")(src, follow_symlinks=follow)
mode = stat.S_IMODE(st.st_mode) mode = stat.S_IMODE(st.st_mode)
lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns), lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
...@@ -415,79 +431,47 @@ def ignore_patterns(*patterns): ...@@ -415,79 +431,47 @@ def ignore_patterns(*patterns):
return set(ignored_names) return set(ignored_names)
return _ignore_patterns return _ignore_patterns
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, def _copytree(entries, src, dst, symlinks, ignore, copy_function,
ignore_dangling_symlinks=False): ignore_dangling_symlinks):
"""Recursively copy a directory tree.
The destination directory must not already exist.
If exception(s) occur, an Error is raised with a list of reasons.
If the optional symlinks flag is true, symbolic links in the
source tree result in symbolic links in the destination tree; if
it is false, the contents of the files pointed to by symbolic
links are copied. If the file pointed by the symlink doesn't
exist, an exception will be added in the list of errors raised in
an Error exception at the end of the copy process.
You can set the optional ignore_dangling_symlinks flag to true if you
want to silence this exception. Notice that this has no effect on
platforms that don't support os.symlink.
The optional ignore argument is a callable. If given, it
is called with the `src` parameter, which is the directory
being visited by copytree(), and `names` which is the list of
`src` contents, as returned by os.listdir():
callable(src, names) -> ignored_names
Since copytree() is called recursively, the callable will be
called once for each directory that is copied. It returns a
list of names relative to the `src` directory that should
not be copied.
The optional copy_function argument is a callable that will be used
to copy each file. It will be called with the source path and the
destination path as arguments. By default, copy2() is used, but any
function that supports the same signature (like copy()) can be used.
"""
names = os.listdir(src)
if ignore is not None: if ignore is not None:
ignored_names = ignore(src, names) ignored_names = ignore(src, set(os.listdir(src)))
else: else:
ignored_names = set() ignored_names = set()
os.makedirs(dst) os.makedirs(dst)
errors = [] errors = []
for name in names: use_srcentry = copy_function is copy2 or copy_function is copy
if name in ignored_names:
for srcentry in entries:
if srcentry.name in ignored_names:
continue continue
srcname = os.path.join(src, name) srcname = os.path.join(src, srcentry.name)
dstname = os.path.join(dst, name) dstname = os.path.join(dst, srcentry.name)
srcobj = srcentry if use_srcentry else srcname
try: try:
if os.path.islink(srcname): if srcentry.is_symlink():
linkto = os.readlink(srcname) linkto = os.readlink(srcname)
if symlinks: if symlinks:
# We can't just leave it to `copy_function` because legacy # We can't just leave it to `copy_function` because legacy
# code with a custom `copy_function` may rely on copytree # code with a custom `copy_function` may rely on copytree
# doing the right thing. # doing the right thing.
os.symlink(linkto, dstname) os.symlink(linkto, dstname)
copystat(srcname, dstname, follow_symlinks=not symlinks) copystat(srcobj, dstname, follow_symlinks=not symlinks)
else: else:
# ignore dangling symlink if the flag is on # ignore dangling symlink if the flag is on
if not os.path.exists(linkto) and ignore_dangling_symlinks: if not os.path.exists(linkto) and ignore_dangling_symlinks:
continue continue
# otherwise let the copy occurs. copy2 will raise an error # otherwise let the copy occurs. copy2 will raise an error
if os.path.isdir(srcname): if srcentry.is_dir():
copytree(srcname, dstname, symlinks, ignore, copytree(srcobj, dstname, symlinks, ignore,
copy_function) copy_function)
else: else:
copy_function(srcname, dstname) copy_function(srcobj, dstname)
elif os.path.isdir(srcname): elif srcentry.is_dir():
copytree(srcname, dstname, symlinks, ignore, copy_function) copytree(srcobj, dstname, symlinks, ignore, copy_function)
else: else:
# Will raise a SpecialFileError for unsupported file types # Will raise a SpecialFileError for unsupported file types
copy_function(srcname, dstname) copy_function(srcentry, dstname)
# catch the Error from the recursive copytree so that we can # catch the Error from the recursive copytree so that we can
# continue with other files # continue with other files
except Error as err: except Error as err:
...@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2, ...@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
raise Error(errors) raise Error(errors)
return dst return dst
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
             ignore_dangling_symlinks=False):
    """Recursively copy a directory tree.

    The destination directory must not already exist.
    If exception(s) occur, an Error is raised with a list of reasons.

    If the optional symlinks flag is true, symbolic links in the
    source tree result in symbolic links in the destination tree; if
    it is false, the contents of the files pointed to by symbolic
    links are copied. If the file pointed to by the symlink doesn't
    exist, an exception will be added in the list of errors raised in
    an Error exception at the end of the copy process.

    You can set the optional ignore_dangling_symlinks flag to true if you
    want to silence this exception. Notice that this has no effect on
    platforms that don't support os.symlink.

    The optional ignore argument is a callable. If given, it
    is called with the `src` parameter, which is the directory
    being visited by copytree(), and `names` which is the list of
    `src` contents, as returned by os.listdir():

        callable(src, names) -> ignored_names

    Since copytree() is called recursively, the callable will be
    called once for each directory that is copied. It returns a
    list of names relative to the `src` directory that should
    not be copied.

    The optional copy_function argument is a callable that will be used
    to copy each file. It will be called with the source path and the
    destination path as arguments. By default, copy2() is used, but any
    function that supports the same signature (like copy()) can be used.

    Returns the value returned by _copytree() for this directory.
    """
    # Read the complete listing and close the scandir() handle *before*
    # delegating.  _copytree() creates `dst` and recurses back into
    # copytree() while it walks the entries; feeding it the live iterator
    # would keep one open directory handle per nesting level and, when
    # `dst` lives inside `src`, could yield the freshly created
    # destination as an entry and recurse without end.
    with os.scandir(src) as itr:
        entries = list(itr)
    return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
                     ignore=ignore, copy_function=copy_function,
                     ignore_dangling_symlinks=ignore_dangling_symlinks)
# version vulnerable to race conditions # version vulnerable to race conditions
def _rmtree_unsafe(path, onerror): def _rmtree_unsafe(path, onerror):
try: try:
......
:func:`shutil.copytree` uses the :func:`os.scandir` function, and all copy
functions that depend on it use cached :func:`os.stat` values. The speedup
for copying a directory with 8000 files is around +9% on Linux, +20% on
Windows and +30% on a Windows SMB share. Also, the number of :func:`os.stat`
syscalls is reduced by 38%, making :func:`shutil.copytree` especially faster
on network filesystems.
(Contributed by Giampaolo Rodola' in :issue:`33695`.)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment