bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)

56624a99 · Evan · Vinay Sajip · 2b843ac0 · 56624a99 · 56624a99
Commit 56624a99 authored Jun 02, 2019 by Evan Committed by Vinay Sajip Jun 01, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 61 additions and 23 deletions

Doc/library/shlex.rst Doc/library/shlex.rst +23 -12

Lib/shlex.py Lib/shlex.py +2 -1

Lib/test/test_shlex.py Lib/test/test_shlex.py +36 -10

No files found.
--- a/Doc/library/shlex.rst
+++ b/Doc/library/shlex.rst
@@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
   appear in filename specifications and command line parameters, will also be
   included in this attribute, and any characters which appear in
   ``punctuation_chars`` will be removed from ``wordchars`` if they are present
-   there.
+   there. If :attr:`whitespace_split` is set to ``True``, ``wordchars`` will
+   have no effect.


 .. attribute:: shlex.whitespace
@@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:

   If ``True``, tokens will only be split in whitespaces.  This is useful, for
   example, for parsing command lines with :class:`~shlex.shlex`, getting
-   tokens in a similar way to shell arguments.  If this attribute is ``True``,
-   :attr:`punctuation_chars` will have no effect, and splitting will happen
-   only on whitespaces.  When using :attr:`punctuation_chars`, which is
-   intended to provide parsing closer to that implemented by shells, it is
-   advisable to leave ``whitespace_split`` as ``False`` (the default value).
+   tokens in a similar way to shell arguments.  When used in combination with
+   :attr:`punctuation_chars`, tokens will be split on whitespace in addition to
+   those characters.
+
+   .. versionchanged:: 3.8
+      The :attr:`punctuation_chars` attribute was made compatible with the
+      :attr:`whitespace_split` attribute.


 .. attribute:: shlex.infile
@@ -398,12 +401,15 @@ otherwise.  To illustrate, you can see the difference in the following snippet:

    >>> import shlex
    >>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
-    >>> list(shlex.shlex(text))
-    ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
-    "'abc'", ';', '(', 'def', '"ghi"', ')']
-    >>> list(shlex.shlex(text, punctuation_chars=True))
-    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
-    ';', '(', 'def', '"ghi"', ')']
+    >>> s = shlex.shlex(text, posix=True)
+    >>> s.whitespace_split = True
+    >>> list(s)
+    ['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
+    >>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
+    >>> s.whitespace_split = True
+    >>> list(s)
+    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
+    '(', 'def', 'ghi', ')']

 Of course, tokens will be returned which are not valid for shells, and you'll
 need to implement your own error checks on the returned tokens.
@@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
      >>> list(s)
      ['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']

+   However, to match the shell as closely as possible, it is recommended to
+   always use ``posix`` and :attr:`~shlex.whitespace_split` when using
+   :attr:`~shlex.punctuation_chars`; enabling ``whitespace_split`` makes
+   :attr:`~shlex.wordchars` have no effect.
+
 For best effect, ``punctuation_chars`` should be set in conjunction with
 ``posix=True``. (Note that ``posix=False`` is the default for
 :class:`~shlex.shlex`.)
--- a/Lib/shlex.py
+++ b/Lib/shlex.py
@@ -246,7 +246,8 @@ class shlex:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
-                      or self.whitespace_split):
+                      or (self.whitespace_split and
+                          nextchar not in self.punctuation_chars)):
                    self.token += nextchar
                else:
                    if self.punctuation_chars:

--- a/Lib/test/test_shlex.py
+++ b/Lib/test/test_shlex.py
 import io
+import itertools
 import shlex
 import string
 import unittest
@@ -183,10 +184,12 @@ class ShlexTest(unittest.TestCase):
            src = ['echo hi %s echo bye' % delimiter,
                   'echo hi%secho bye' % delimiter]
            ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))

    def testSyntaxSplitSemicolon(self):
        """Test handling of syntax splitting of ;"""
@@ -197,10 +200,12 @@ class ShlexTest(unittest.TestCase):
                   'echo hi%s echo bye' % delimiter,
                   'echo hi%secho bye' % delimiter]
            ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))

    def testSyntaxSplitRedirect(self):
        """Test handling of syntax splitting of >"""
@@ -211,10 +216,12 @@ class ShlexTest(unittest.TestCase):
                   'echo hi%s out' % delimiter,
                   'echo hi%sout' % delimiter]
            ref = ['echo', 'hi', delimiter, 'out']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))

    def testSyntaxSplitParen(self):
        """Test handling of syntax splitting of ()"""
@@ -222,18 +228,25 @@ class ShlexTest(unittest.TestCase):
        src = ['( echo hi )',
               '(echo hi)']
        ref = ['(', 'echo', 'hi', ')']
-        for ss in src:
+        for ss, ws in itertools.product(src, (False, True)):
            s = shlex.shlex(ss, punctuation_chars=True)
+            s.whitespace_split = ws
            result = list(s)
-            self.assertEqual(ref, result, "While splitting '%s'" % ss)
+            self.assertEqual(ref, result,
+                             "While splitting '%s' [ws=%s]" % (ss, ws))

    def testSyntaxSplitCustom(self):
        """Test handling of syntax splitting with custom chars"""
+        ss = "~/a&&b-c --color=auto||d *.py?"
        ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
-        ss = "~/a && b-c --color=auto || d *.py?"
        s = shlex.shlex(ss, punctuation_chars="|")
        result = list(s)
-        self.assertEqual(ref, result, "While splitting '%s'" % ss)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
+        ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
+        s = shlex.shlex(ss, punctuation_chars="|")
+        s.whitespace_split = True
+        result = list(s)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)

    def testTokenTypes(self):
        """Test that tokens are split with types as expected."""
@@ -293,6 +306,19 @@ class ShlexTest(unittest.TestCase):
        s = shlex.shlex("'')abc", punctuation_chars=True)
        self.assertEqual(list(s), expected)

+    def testUnicodeHandling(self):
+        """Test punctuation_chars and whitespace_split handle unicode."""
+        ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
+        # Should be parsed as one complete token (whitespace_split=True).
+        ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        s.whitespace_split = True
+        self.assertEqual(list(s), ref)
+        # Without whitespace_split, uses wordchars and splits on all.
+        ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        self.assertEqual(list(s), ref)
+
    def testQuote(self):
        safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
        unicode_sample = '\xe9\xe0\xdf'  # e + acute accent, a + grave, sharp s