Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.

Based on patch by Kaarle Ritvanen.

Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
Based on patch by Kaarle Ritvanen.
aeacde7b · Serhiy Storchaka · 20e5a346 · aeacde7b · aeacde7b · aeacde7b
Commit aeacde7b authored Oct 25, 2016 by Serhiy Storchaka
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 47 additions and 14 deletions

Lib/test/test_textwrap.py +31 -0

Lib/textwrap.py +13 -14

Misc/NEWS +3 -0

No files found.
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -444,6 +444,37 @@ What a mess!
        text = "aa \xe4\xe4-\xe4\xe4"
        self.check_wrap(text, 7, ["aa \xe4\xe4-", "\xe4\xe4"])
+    def test_non_breaking_space(self):
+        text = 'This is a sentence with non-breaking\N{NO-BREAK SPACE}space.'
+        self.check_wrap(text, 20,
+                        ['This is a sentence',
+                         'with non-',
+                         'breaking\N{NO-BREAK SPACE}space.'],
+                        break_on_hyphens=True)
+        self.check_wrap(text, 20,
+                        ['This is a sentence',
+                         'with',
+                         'non-breaking\N{NO-BREAK SPACE}space.'],
+                        break_on_hyphens=False)
+    def test_narrow_non_breaking_space(self):
+        text = ('This is a sentence with non-breaking'
+                '\N{NARROW NO-BREAK SPACE}space.')
+        self.check_wrap(text, 20,
+                        ['This is a sentence',
+                         'with non-',
+                         'breaking\N{NARROW NO-BREAK SPACE}space.'],
+                        break_on_hyphens=True)
+        self.check_wrap(text, 20,
+                        ['This is a sentence',
+                         'with',
+                         'non-breaking\N{NARROW NO-BREAK SPACE}space.'],
+                        break_on_hyphens=False)
 class MaxLinesTestCase(BaseTestCase):
    text = "Hello there, how are you this fine day?  I'm glad to hear it!"

--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -10,13 +10,8 @@ import re
 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
 # Hardcode the recognized whitespace characters to the US-ASCII
-# whitespace characters.  The main reason for doing this is that in
+# whitespace characters.  The main reason for doing this is that
-# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
+# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
-# that character winds up in string.whitespace.  Respecting
-# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
-# same as any other whitespace char, which is clearly wrong (it's a
-# *non-breaking* space), 2) possibly cause problems with Unicode,
-# since 0xa0 is not in range(128).
 _whitespace = '\t\n\x0b\x0c\r '
 class TextWrapper:
@@ -81,29 +76,34 @@ class TextWrapper:
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
+    whitespace = r'[%s]' % re.escape(_whitespace)
+    nowhitespace = '[^' + whitespace[1:]
    wordsep_re = re.compile(r'''
        ( # any whitespace
-          \s+
+          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
-          \S+? (?:
+          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
-              (?=\s|\Z)
+              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
-        )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
+        )''' % {'wp': word_punct, 'lt': letter,
-    del word_punct, letter
+                'ws': whitespace, 'nws': nowhitespace},
+        re.VERBOSE)
+    del word_punct, letter, nowhitespace
    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(\s+)')
+    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+    del whitespace
    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
@@ -112,7 +112,6 @@ class TextWrapper:
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk
    def __init__(self,
                 width=70,
                 initial_indent="",

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -113,6 +113,9 @@ Core and Builtins
 Library
 -------
+- Issue #20491: The textwrap.TextWrapper class now honors non-breaking spaces.
+  Based on patch by Kaarle Ritvanen.
 - Issue #28353: os.fwalk() no longer fails on broken links.
 - Issue #25464: Fixed HList.header_exists() in tkinter.tix module by addin