Commit 566dc35b authored by Jason R. Coombs's avatar Jason R. Coombs Committed by GitHub

Merge pull request #764 from timheap/fast_egg_info_command

Do not scan whole file tree when making MANIFEST
parents 07abd732 bb45468d
......@@ -3,6 +3,7 @@
Create a distribution's .egg-info directory and contents"""
from distutils.filelist import FileList as _FileList
from distutils.errors import DistutilsInternalError
from distutils.util import convert_path
from distutils import log
import distutils.errors
......@@ -27,6 +28,7 @@ from pkg_resources import (
parse_requirements, safe_name, parse_version,
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
import setuptools.unicode_utils as unicode_utils
from setuptools.glob import glob
from pkg_resources.extern import packaging
......@@ -36,6 +38,88 @@ except ImportError:
pass
def translate_pattern(glob):
"""
Translate a file path glob like '*.txt' in to a regular expression.
This differs from fnmatch.translate which allows wildcards to match
directory separators. It also knows about '**/' which matches any number of
directories.
"""
pat = ''
# This will split on '/' within [character classes]. This is deliberate.
chunks = glob.split(os.path.sep)
sep = re.escape(os.sep)
valid_char = '[^%s]' % (sep,)
for c, chunk in enumerate(chunks):
last_chunk = c == len(chunks) - 1
# Chunks that are a literal ** are globstars. They match anything.
if chunk == '**':
if last_chunk:
# Match anything if this is the last component
pat += '.*'
else:
# Match '(name/)*'
pat += '(?:%s+%s)*' % (valid_char, sep)
continue # Break here as the whole path component has been handled
# Find any special characters in the remainder
i = 0
chunk_len = len(chunk)
while i < chunk_len:
char = chunk[i]
if char == '*':
# Match any number of name characters
pat += valid_char + '*'
elif char == '?':
# Match a name character
pat += valid_char
elif char == '[':
# Character class
inner_i = i + 1
# Skip initial !/] chars
if inner_i < chunk_len and chunk[inner_i] == '!':
inner_i = inner_i + 1
if inner_i < chunk_len and chunk[inner_i] == ']':
inner_i = inner_i + 1
# Loop till the closing ] is found
while inner_i < chunk_len and chunk[inner_i] != ']':
inner_i = inner_i + 1
if inner_i >= chunk_len:
# Got to the end of the string without finding a closing ]
# Do not treat this as a matching group, but as a literal [
pat += re.escape(char)
else:
# Grab the insides of the [brackets]
inner = chunk[i + 1:inner_i]
char_class = ''
# Class negation
if inner[0] == '!':
char_class = '^'
inner = inner[1:]
char_class += re.escape(inner)
pat += '[%s]' % (char_class,)
# Skip to the end ]
i = inner_i
else:
pat += re.escape(char)
i += 1
# Join each chunk with the dir separator
if not last_chunk:
pat += sep
return re.compile(pat + r'\Z(?ms)')
class egg_info(Command):
description = "create a distribution's .egg-info directory"
......@@ -239,7 +323,151 @@ class egg_info(Command):
class FileList(_FileList):
"""File list that accepts only existing, platform-independent paths"""
# Implementations of the various MANIFEST.in commands
def process_template_line(self, line):
# Parse the line: split it up, make sure the right number of words
# is there, and return the relevant words. 'action' is always
# defined: it's the first word of the line. Which of the other
# three are defined depends on the action; it'll be either
# patterns, (dir and patterns), or (dir_pattern).
(action, patterns, dir, dir_pattern) = self._parse_template_line(line)
# OK, now we know that the action is valid and we have the
# right number of words on the line for that action -- so we
# can proceed with minimal error-checking.
if action == 'include':
self.debug_print("include " + ' '.join(patterns))
for pattern in patterns:
if not self.include(pattern):
log.warn("warning: no files found matching '%s'", pattern)
elif action == 'exclude':
self.debug_print("exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.exclude(pattern):
log.warn(("warning: no previously-included files "
"found matching '%s'"), pattern)
elif action == 'global-include':
self.debug_print("global-include " + ' '.join(patterns))
for pattern in patterns:
if not self.global_include(pattern):
log.warn(("warning: no files found matching '%s' "
"anywhere in distribution"), pattern)
elif action == 'global-exclude':
self.debug_print("global-exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.global_exclude(pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found anywhere in distribution"),
pattern)
elif action == 'recursive-include':
self.debug_print("recursive-include %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_include(dir, pattern):
log.warn(("warning: no files found matching '%s' "
"under directory '%s'"),
pattern, dir)
elif action == 'recursive-exclude':
self.debug_print("recursive-exclude %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_exclude(dir, pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found under directory '%s'"),
pattern, dir)
elif action == 'graft':
self.debug_print("graft " + dir_pattern)
if not self.graft(dir_pattern):
log.warn("warning: no directories found matching '%s'",
dir_pattern)
elif action == 'prune':
self.debug_print("prune " + dir_pattern)
if not self.prune(dir_pattern):
log.warn(("no previously-included directories found "
"matching '%s'"), dir_pattern)
else:
raise DistutilsInternalError(
"this cannot happen: invalid action '%s'" % action)
def _remove_files(self, predicate):
"""
Remove all files from the file list that match the predicate.
Return True if any matching files were removed
"""
found = False
for i in range(len(self.files) - 1, -1, -1):
if predicate(self.files[i]):
self.debug_print(" removing " + self.files[i])
del self.files[i]
found = True
return found
def include(self, pattern):
"""Include files that match 'pattern'."""
found = [f for f in glob(pattern) if not os.path.isdir(f)]
self.extend(found)
return bool(found)
def exclude(self, pattern):
"""Exclude files that match 'pattern'."""
match = translate_pattern(pattern)
return self._remove_files(match.match)
def recursive_include(self, dir, pattern):
"""
Include all files anywhere in 'dir/' that match the pattern.
"""
full_pattern = os.path.join(dir, '**', pattern)
found = [f for f in glob(full_pattern, recursive=True)
if not os.path.isdir(f)]
self.extend(found)
return bool(found)
def recursive_exclude(self, dir, pattern):
"""
Exclude any file anywhere in 'dir/' that match the pattern.
"""
match = translate_pattern(os.path.join(dir, '**', pattern))
return self._remove_files(match.match)
def graft(self, dir):
"""Include all files from 'dir/'."""
found = distutils.filelist.findall(dir)
self.extend(found)
return bool(found)
def prune(self, dir):
"""Filter out files from 'dir/'."""
match = translate_pattern(os.path.join(dir, '**'))
return self._remove_files(match.match)
def global_include(self, pattern):
"""
Include all files anywhere in the current directory that match the
pattern. This is very inefficient on large file trees.
"""
if self.allfiles is None:
self.findall()
match = translate_pattern(os.path.join('**', pattern))
found = [f for f in self.allfiles if match.match(f)]
self.extend(found)
return bool(found)
def global_exclude(self, pattern):
"""
Exclude all files anywhere that match the pattern.
"""
match = translate_pattern(os.path.join('**', pattern))
return self._remove_files(match.match)
def append(self, item):
if item.endswith('\r'): # Fix older sdists built on Windows
......@@ -302,7 +530,6 @@ class manifest_maker(sdist):
self.filelist = FileList()
if not os.path.exists(self.manifest):
self.write_manifest() # it must exist so it'll get in the list
self.filelist.findall()
self.add_defaults()
if os.path.exists(self.template):
self.read_template()
......@@ -341,38 +568,13 @@ class manifest_maker(sdist):
elif os.path.exists(self.manifest):
self.read_manifest()
ei_cmd = self.get_finalized_command('egg_info')
self._add_egg_info(cmd=ei_cmd)
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)
def _add_egg_info(self, cmd):
"""
Add paths for egg-info files for an external egg-base.
The egg-info files are written to egg-base. If egg-base is
outside the current working directory, this method
searchs the egg-base directory for files to include
in the manifest. Uses distutils.filelist.findall (which is
really the version monkeypatched in by setuptools/__init__.py)
to perform the search.
Since findall records relative paths, prefix the returned
paths with cmd.egg_base, so add_default's include_pattern call
(which is looking for the absolute cmd.egg_info) will match
them.
"""
if cmd.egg_base == os.curdir:
# egg-info files were already added by something else
return
discovered = distutils.filelist.findall(cmd.egg_base)
resolved = (os.path.join(cmd.egg_base, path) for path in discovered)
self.filelist.allfiles.extend(resolved)
self.filelist.graft(ei_cmd.egg_info)
def prune_file_list(self):
build = self.get_finalized_command('build')
base_dir = self.distribution.get_fullname()
self.filelist.exclude_pattern(None, prefix=build.build_base)
self.filelist.exclude_pattern(None, prefix=base_dir)
self.filelist.prune(build.build_base)
self.filelist.prune(base_dir)
sep = re.escape(os.sep)
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
is_regex=1)
......
"""
Filename globbing utility. Mostly a copy of `glob` from Python 3.5.
Changes include:
* `yield from` and PEP3102 `*` removed.
* `bytes` changed to `six.binary_type`.
* Hidden files are not ignored.
"""
import os
import re
import fnmatch
from setuptools.extern.six import binary_type
__all__ = ["glob", "iglob", "escape"]
def glob(pathname, recursive=False):
"""Return a list of paths matching a pathname pattern.
The pattern may contain simple shell-style wildcards a la
fnmatch. However, unlike fnmatch, filenames starting with a
dot are special cases that are not matched by '*' and '?'
patterns.
If recursive is true, the pattern '**' will match any files and
zero or more directories and subdirectories.
"""
return list(iglob(pathname, recursive=recursive))
def iglob(pathname, recursive=False):
"""Return an iterator which yields the paths matching a pathname pattern.
The pattern may contain simple shell-style wildcards a la
fnmatch. However, unlike fnmatch, filenames starting with a
dot are special cases that are not matched by '*' and '?'
patterns.
If recursive is true, the pattern '**' will match any files and
zero or more directories and subdirectories.
"""
it = _iglob(pathname, recursive)
if recursive and _isrecursive(pathname):
s = next(it) # skip empty string
assert not s
return it
def _iglob(pathname, recursive):
dirname, basename = os.path.split(pathname)
if not has_magic(pathname):
if basename:
if os.path.lexists(pathname):
yield pathname
else:
# Patterns ending with a slash should match only directories
if os.path.isdir(dirname):
yield pathname
return
if not dirname:
if recursive and _isrecursive(basename):
for x in glob2(dirname, basename):
yield x
else:
for x in glob1(dirname, basename):
yield x
return
# `os.path.split()` returns the argument itself as a dirname if it is a
# drive or UNC path. Prevent an infinite recursion if a drive or UNC path
# contains magic characters (i.e. r'\\?\C:').
if dirname != pathname and has_magic(dirname):
dirs = _iglob(dirname, recursive)
else:
dirs = [dirname]
if has_magic(basename):
if recursive and _isrecursive(basename):
glob_in_dir = glob2
else:
glob_in_dir = glob1
else:
glob_in_dir = glob0
for dirname in dirs:
for name in glob_in_dir(dirname, basename):
yield os.path.join(dirname, name)
# These 2 helper functions non-recursively glob inside a literal directory.
# They return a list of basenames. `glob1` accepts a pattern while `glob0`
# takes a literal basename (so it only has to check for its existence).
def glob1(dirname, pattern):
if not dirname:
if isinstance(pattern, binary_type):
dirname = os.curdir.encode('ASCII')
else:
dirname = os.curdir
try:
names = os.listdir(dirname)
except OSError:
return []
return fnmatch.filter(names, pattern)
def glob0(dirname, basename):
if not basename:
# `os.path.split()` returns an empty basename for paths ending with a
# directory separator. 'q*x/' should match only directories.
if os.path.isdir(dirname):
return [basename]
else:
if os.path.lexists(os.path.join(dirname, basename)):
return [basename]
return []
# This helper function recursively yields relative pathnames inside a literal
# directory.
def glob2(dirname, pattern):
assert _isrecursive(pattern)
yield pattern[:0]
for x in _rlistdir(dirname):
yield x
# Recursively yields relative pathnames inside a literal directory.
def _rlistdir(dirname):
if not dirname:
if isinstance(dirname, binary_type):
dirname = binary_type(os.curdir, 'ASCII')
else:
dirname = os.curdir
try:
names = os.listdir(dirname)
except os.error:
return
for x in names:
yield x
path = os.path.join(dirname, x) if dirname else x
for y in _rlistdir(path):
yield os.path.join(x, y)
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')
def has_magic(s):
if isinstance(s, binary_type):
match = magic_check_bytes.search(s)
else:
match = magic_check.search(s)
return match is not None
def _isrecursive(pattern):
if isinstance(pattern, binary_type):
return pattern == b'**'
else:
return pattern == '**'
def escape(pathname):
"""Escape all special characters.
"""
# Escaping is done by wrapping any of "*?[" between square brackets.
# Metacharacters do not work in the drive part and shouldn't be escaped.
drive, pathname = os.path.splitdrive(pathname)
if isinstance(pathname, binary_type):
pathname = magic_check_bytes.sub(br'[\1]', pathname)
else:
pathname = magic_check.sub(r'[\1]', pathname)
return drive + pathname
......@@ -9,7 +9,7 @@ import tempfile
from distutils import log
from distutils.errors import DistutilsTemplateError
from setuptools.command.egg_info import FileList, egg_info
from setuptools.command.egg_info import FileList, egg_info, translate_pattern
from setuptools.dist import Distribution
from setuptools.extern import six
from setuptools.tests.textwrap import DALS
......@@ -66,6 +66,34 @@ default_files = frozenset(map(make_local_path, [
]))
def get_pattern(glob):
return translate_pattern(make_local_path(glob)).pattern
def test_translated_pattern_test():
l = make_local_path
assert get_pattern('foo') == r'foo\Z(?ms)'
assert get_pattern(l('foo/bar')) == l(r'foo\/bar\Z(?ms)')
# Glob matching
assert get_pattern('*.txt') == l(r'[^\/]*\.txt\Z(?ms)')
assert get_pattern('dir/*.txt') == l(r'dir\/[^\/]*\.txt\Z(?ms)')
assert get_pattern('*/*.py') == l(r'[^\/]*\/[^\/]*\.py\Z(?ms)')
assert get_pattern('docs/page-?.txt') \
== l(r'docs\/page\-[^\/]\.txt\Z(?ms)')
# Globstars change what they mean depending upon where they are
assert get_pattern(l('foo/**/bar')) == l(r'foo\/(?:[^\/]+\/)*bar\Z(?ms)')
assert get_pattern(l('foo/**')) == l(r'foo\/.*\Z(?ms)')
assert get_pattern(l('**')) == r'.*\Z(?ms)'
# Character classes
assert get_pattern('pre[one]post') == r'pre[one]post\Z(?ms)'
assert get_pattern('hello[!one]world') == r'hello[^one]world\Z(?ms)'
assert get_pattern('[]one].txt') == r'[\]one]\.txt\Z(?ms)'
assert get_pattern('foo[!]one]bar') == r'foo[^\]one]bar\Z(?ms)'
class TempDirTestCase(object):
def setup_method(self, method):
......@@ -346,23 +374,21 @@ class TestFileListTest(TempDirTestCase):
def test_include_pattern(self):
# return False if no match
file_list = FileList()
file_list.set_allfiles([])
self.make_files([])
assert not file_list.include_pattern('*.py')
# return True if files match
file_list = FileList()
file_list.set_allfiles(['a.py', 'b.txt'])
self.make_files(['a.py', 'b.txt'])
assert file_list.include_pattern('*.py')
# test * matches all files
file_list = FileList()
assert file_list.allfiles is None
file_list.set_allfiles(['a.py', 'b.txt'])
self.make_files(['a.py', 'b.txt'])
file_list.include_pattern('*')
assert file_list.allfiles == ['a.py', 'b.txt']
assert file_list.files == ['a.py', 'b.txt']
def test_process_template(self):
l = make_local_path
def test_process_template_line_invalid(self):
# invalid lines
file_list = FileList()
for action in ('include', 'exclude', 'global-include',
......@@ -377,9 +403,11 @@ class TestFileListTest(TempDirTestCase):
else:
assert False, "Should have thrown an error"
def test_include(self):
l = make_local_path
# include
file_list = FileList()
file_list.set_allfiles(['a.py', 'b.txt', l('d/c.py')])
self.make_files(['a.py', 'b.txt', l('d/c.py')])
file_list.process_template_line('include *.py')
assert file_list.files == ['a.py']
......@@ -389,6 +417,8 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == ['a.py']
self.assertWarnings()
def test_exclude(self):
l = make_local_path
# exclude
file_list = FileList()
file_list.files = ['a.py', 'b.txt', l('d/c.py')]
......@@ -401,9 +431,11 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == ['b.txt', l('d/c.py')]
self.assertWarnings()
def test_global_include(self):
l = make_local_path
# global-include
file_list = FileList()
file_list.set_allfiles(['a.py', 'b.txt', l('d/c.py')])
self.make_files(['a.py', 'b.txt', l('d/c.py')])
file_list.process_template_line('global-include *.py')
assert file_list.files == ['a.py', l('d/c.py')]
......@@ -413,6 +445,8 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == ['a.py', l('d/c.py')]
self.assertWarnings()
def test_global_exclude(self):
l = make_local_path
# global-exclude
file_list = FileList()
file_list.files = ['a.py', 'b.txt', l('d/c.py')]
......@@ -425,10 +459,11 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == ['b.txt']
self.assertWarnings()
def test_recursive_include(self):
l = make_local_path
# recursive-include
file_list = FileList()
file_list.set_allfiles(['a.py', l('d/b.py'), l('d/c.txt'),
l('d/d/e.py')])
self.make_files(['a.py', l('d/b.py'), l('d/c.txt'), l('d/d/e.py')])
file_list.process_template_line('recursive-include d *.py')
assert file_list.files == [l('d/b.py'), l('d/d/e.py')]
......@@ -438,6 +473,8 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == [l('d/b.py'), l('d/d/e.py')]
self.assertWarnings()
def test_recursive_exclude(self):
l = make_local_path
# recursive-exclude
file_list = FileList()
file_list.files = ['a.py', l('d/b.py'), l('d/c.txt'), l('d/d/e.py')]
......@@ -450,10 +487,11 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == ['a.py', l('d/c.txt')]
self.assertWarnings()
def test_graft(self):
l = make_local_path
# graft
file_list = FileList()
file_list.set_allfiles(['a.py', l('d/b.py'), l('d/d/e.py'),
l('f/f.py')])
self.make_files(['a.py', l('d/b.py'), l('d/d/e.py'), l('f/f.py')])
file_list.process_template_line('graft d')
assert file_list.files == [l('d/b.py'), l('d/d/e.py')]
......@@ -463,6 +501,8 @@ class TestFileListTest(TempDirTestCase):
assert file_list.files == [l('d/b.py'), l('d/d/e.py')]
self.assertWarnings()
def test_prune(self):
l = make_local_path
# prune
file_list = FileList()
file_list.files = ['a.py', l('d/b.py'), l('d/d/e.py'), l('f/f.py')]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment