Issue #21765: Add support for non-ascii identifiers to HyperParser

2e4394ee · Tal Einat · b5cace89 · 9b7f9e6c · 2e4394ee · 2e4394ee
Commit 2e4394ee authored Jul 16, 2014 by Tal Einat
4 changed files
--- a/Lib/idlelib/HyperParser.py
+++ b/Lib/idlelib/HyperParser.py
@@ -6,11 +6,24 @@ the structure of code.
 """

 import string
-import keyword
+from keyword import iskeyword
 from idlelib import PyParse

-class HyperParser:

+# all ASCII chars that may be in an identifier
+_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
+# all ASCII chars that may be the first char of an identifier
+_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
+
+# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
+_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
+# lookup table for whether 7-bit ASCII chars are valid as the first
+# char in a Python identifier
+_IS_ASCII_ID_FIRST_CHAR = \
+    [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
+
+
+class HyperParser:
    def __init__(self, editwin, index):
        "To initialize, analyze the surroundings of the given index."

@@ -143,26 +156,70 @@ class HyperParser:

        return beforeindex, afterindex

-    # Ascii chars that may be in a white space
-    _whitespace_chars = " \t\n\\"
-    # Ascii chars that may be in an identifier
-    _id_chars = string.ascii_letters + string.digits + "_"
-    # Ascii chars that may be the first char of an identifier
-    _id_first_chars = string.ascii_letters + "_"
-
-    # Given a string and pos, return the number of chars in the
-    # identifier which ends at pos, or 0 if there is no such one. Saved
-    # words are not identifiers.
-    def _eat_identifier(self, str, limit, pos):
+    # the set of built-in identifiers which are also keywords,
+    # i.e. keyword.iskeyword() returns True for them
+    _ID_KEYWORDS = frozenset({"True", "False", "None"})
+
+    @classmethod
+    def _eat_identifier(cls, str, limit, pos):
+        """Given a string and pos, return the number of chars in the
+        identifier which ends at pos, or 0 if there is no such one.
+
+        Keywords, except for True, False and None, are not identifiers.
+        """
+        is_ascii_id_char = _IS_ASCII_ID_CHAR
+
+        # Start at the end (pos) and work backwards.
        i = pos
-        while i > limit and str[i-1] in self._id_chars:
+
+        # Go backwards as long as the characters are valid ASCII
+        # identifier characters. This is an optimization, since it
+        # is faster in the common case where most of the characters
+        # are ASCII.
+        while i > limit and (
+                ord(str[i - 1]) < 128 and
+                is_ascii_id_char[ord(str[i - 1])]
+        ):
            i -= 1
-        if (i < pos and (str[i] not in self._id_first_chars or
-            (keyword.iskeyword(str[i:pos]) and
-             str[i:pos] not in {'None', 'False', 'True'}))):
-            i = pos
+
+        # If the above loop ended due to reaching a non-ASCII
+        # character, continue going backwards using the most generic
+        # test for whether a string contains only valid identifier
+        # characters.
+        if i > limit and ord(str[i - 1]) >= 128:
+            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
+                i -= 4
+            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
+                i -= 2
+            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
+                i -= 1
+
+            # The identifier candidate starts here. If it isn't a valid
+            # identifier, don't eat anything. At this point that is only
+            # possible if the first character isn't a valid first
+            # character for an identifier.
+            if not str[i:pos].isidentifier():
+                return 0
+        elif i < pos:
+            # All characters in str[i:pos] are valid ASCII identifier
+            # characters, so it is enough to check that the first is
+            # valid as the first character of an identifier.
+            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
+                return 0
+
+        # All keywords are valid identifiers, but should not be
+        # considered identifiers here, except for True, False and None.
+        if i < pos and (
+                iskeyword(str[i:pos]) and
+                str[i:pos] not in cls._ID_KEYWORDS
+        ):
+            return 0
+
        return pos - i

+    # This string includes all chars that may be in a white space
+    _whitespace_chars = " \t\n\\"
+
    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

--- a/Lib/idlelib/PyParse.py
+++ b/Lib/idlelib/PyParse.py
 import re
 import sys
+from collections import Mapping
+from functools import partial

 # Reason last stmt is continued (or C_NONE if it's not).
 (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
@@ -91,19 +93,48 @@ _chew_ordinaryre = re.compile(r"""
    [^[\](){}#'"\\]+
 """, re.VERBOSE).match

-# Build translation table to map uninteresting chars to "x", open
-# brackets to "(", and close brackets to ")".

-_tran = {}
-for i in range(256):
-    _tran[i] = 'x'
-for ch in "({[":
-    _tran[ord(ch)] = '('
-for ch in ")}]":
-    _tran[ord(ch)] = ')'
-for ch in "\"'\\\n#":
-    _tran[ord(ch)] = ch
-del i, ch
+class StringTranslatePseudoMapping(Mapping):
+    r"""Utility class to be used with str.translate()
+
+    This Mapping class wraps a given dict. When a value for a key is
+    requested via __getitem__() or get(), the key is looked up in the
+    given dict. If found there, the value from the dict is returned.
+    Otherwise, the default value given upon initialization is returned.
+
+    This allows using str.translate() to make some replacements, and to
+    replace all characters for which no replacement was specified with
+    a given character instead of leaving them as-is.
+
+    For example, to replace everything except whitespace with 'x':
+
+    >>> whitespace_chars = ' \t\n\r'
+    >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
+    >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
+    >>> text = "a + b\tc\nd"
+    >>> text.translate(mapping)
+    'x x x\tx\nx'
+    """
+    def __init__(self, non_defaults, default_value):
+        self._non_defaults = non_defaults
+        self._default_value = default_value
+
+        def _get(key, _get=non_defaults.get, _default=default_value):
+            return _get(key, _default)
+        self._get = _get
+
+    def __getitem__(self, item):
+        return self._get(item)
+
+    def __len__(self):
+        return len(self._non_defaults)
+
+    def __iter__(self):
+        return iter(self._non_defaults)
+
+    def get(self, key, default=None):
+        return self._get(key)
+

 class Parser:

@@ -113,19 +144,6 @@ class Parser:

    def set_str(self, s):
        assert len(s) == 0 or s[-1] == '\n'
-        if isinstance(s, str):
-            # The parse functions have no idea what to do with Unicode, so
-            # replace all Unicode characters with "x".  This is "safe"
-            # so long as the only characters germane to parsing the structure
-            # of Python are 7-bit ASCII.  It's *necessary* because Unicode
-            # strings don't have a .translate() method that supports
-            # deletechars.
-            uniphooey = s
-            s = []
-            push = s.append
-            for raw in map(ord, uniphooey):
-                push(raw < 127 and chr(raw) or "x")
-            s = "".join(s)
        self.str = s
        self.study_level = 0

@@ -197,6 +215,16 @@ class Parser:
        if lo > 0:
            self.str = self.str[lo:]

+    # Build a translation table to map uninteresting chars to 'x', open
+    # brackets to '(', close brackets to ')' while preserving quotes,
+    # backslashes, newlines and hashes. This is to be passed to
+    # str.translate() in _study1().
+    _tran = {}
+    _tran.update((ord(c), ord('(')) for c in "({[")
+    _tran.update((ord(c), ord(')')) for c in ")}]")
+    _tran.update((ord(c), ord(c)) for c in "\"'\\\n#")
+    _tran = StringTranslatePseudoMapping(_tran, default_value=ord('x'))
+
    # As quickly as humanly possible <wink>, find the line numbers (0-
    # based) of the non-continuation lines.
    # Creates self.{goodlines, continuation}.
@@ -211,7 +239,7 @@ class Parser:
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        str = self.str
-        str = str.translate(_tran)
+        str = str.translate(self._tran)
        str = str.replace('xxxxxxxx', 'x')
        str = str.replace('xxxx', 'x')
        str = str.replace('xx', 'x')

--- a/Lib/idlelib/idle_test/test_hyperparser.py
+++ b/Lib/idlelib/idle_test/test_hyperparser.py
@@ -30,6 +30,7 @@ class HyperParserTest(unittest.TestCase):
            "z = ((r'asdf')+('a')))\n"
            '[x for x in\n'
            'for = False\n'
+            'cliché = "this is a string with unicode, what a cliché"'
            )

    @classmethod
@@ -93,6 +94,8 @@ class HyperParserTest(unittest.TestCase):
        self.assertTrue(p.is_in_string())
        p = get('4.6')
        self.assertTrue(p.is_in_string())
+        p = get('12.54')
+        self.assertTrue(p.is_in_string())

    def test_is_in_code(self):
        get = self.get_parser
@@ -180,12 +183,91 @@ class HyperParserTest(unittest.TestCase):
        p = get('10.0')
        self.assertEqual(p.get_expression(), '')

+        p = get('10.6')
+        self.assertEqual(p.get_expression(), '')
+
+        p = get('10.11')
+        self.assertEqual(p.get_expression(), '')
+
        p = get('11.3')
        self.assertEqual(p.get_expression(), '')

        p = get('11.11')
        self.assertEqual(p.get_expression(), 'False')

+        p = get('12.6')
+        self.assertEqual(p.get_expression(), 'cliché')
+
+    def test_eat_identifier(self):
+        def is_valid_id(candidate):
+            result = HyperParser._eat_identifier(candidate, 0, len(candidate))
+            if result == len(candidate):
+                return True
+            elif result == 0:
+                return False
+            else:
+                err_msg = "Unexpected result: {} (expected 0 or {}".format(
+                    result, len(candidate)
+                )
+                raise Exception(err_msg)
+
+        # invalid first character which is valid elsewhere in an identifier
+        self.assertFalse(is_valid_id('2notid'))
+
+        # ASCII-only valid identifiers
+        self.assertTrue(is_valid_id('valid_id'))
+        self.assertTrue(is_valid_id('_valid_id'))
+        self.assertTrue(is_valid_id('valid_id_'))
+        self.assertTrue(is_valid_id('_2valid_id'))
+
+        # keywords which should be "eaten"
+        self.assertTrue(is_valid_id('True'))
+        self.assertTrue(is_valid_id('False'))
+        self.assertTrue(is_valid_id('None'))
+
+        # keywords which should not be "eaten"
+        self.assertFalse(is_valid_id('for'))
+        self.assertFalse(is_valid_id('import'))
+        self.assertFalse(is_valid_id('return'))
+
+        # valid unicode identifiers
+        self.assertTrue(is_valid_id('cliche'))
+        self.assertTrue(is_valid_id('cliché'))
+        self.assertTrue(is_valid_id('a٢'))
+
+        # invalid unicode identifiers
+        self.assertFalse(is_valid_id('2a'))
+        self.assertFalse(is_valid_id('٢a'))
+        self.assertFalse(is_valid_id('a²'))
+
+        # valid identifier after "punctuation"
+        self.assertEqual(HyperParser._eat_identifier('+ var', 0, 5), len('var'))
+        self.assertEqual(HyperParser._eat_identifier('+var', 0, 4), len('var'))
+        self.assertEqual(HyperParser._eat_identifier('.var', 0, 4), len('var'))
+
+        # invalid identifiers
+        self.assertFalse(is_valid_id('+'))
+        self.assertFalse(is_valid_id(' '))
+        self.assertFalse(is_valid_id(':'))
+        self.assertFalse(is_valid_id('?'))
+        self.assertFalse(is_valid_id('^'))
+        self.assertFalse(is_valid_id('\\'))
+        self.assertFalse(is_valid_id('"'))
+        self.assertFalse(is_valid_id('"a string"'))
+
+    def test_eat_identifier_various_lengths(self):
+        eat_id = HyperParser._eat_identifier
+
+        for length in range(1, 21):
+            self.assertEqual(eat_id('a' * length, 0, length), length)
+            self.assertEqual(eat_id('é' * length, 0, length), length)
+            self.assertEqual(eat_id('a' + '2' * (length - 1), 0, length), length)
+            self.assertEqual(eat_id('é' + '2' * (length - 1), 0, length), length)
+            self.assertEqual(eat_id('é' + 'a' * (length - 1), 0, length), length)
+            self.assertEqual(eat_id('é' * (length - 1) + 'a', 0, length), length)
+            self.assertEqual(eat_id('+' * length, 0, length), 0)
+            self.assertEqual(eat_id('2' + 'a' * (length - 1), 0, length), 0)
+            self.assertEqual(eat_id('2' + 'é' * (length - 1), 0, length), 0)

 if __name__ == '__main__':
    unittest.main(verbosity=2)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -264,6 +264,8 @@ Library
 - Issue #21455: Add a default backlog to socket.listen().

 - Issue #21525: Most Tkinter methods which accepted tuples now accept lists too.
+
+- Issue #21765: Add support for non-ascii identifiers to HyperParser.

 - Issue #10744: Fix PEP 3118 format strings on ctypes objects with a nontrivial
  shape.