QueryParser.py:

- Rephrased the description of the grammar, pointing out that the
  lexicon decides on globbing syntax.
- Refactored term and atom parsing (moving atom parsing into a
  separate method).  The previously checked-in version accidentally
  accepted some invalid forms like ``foo AND -bar''; this is fixed.

tests/testQueryParser.py:

- Each test is now in a separate method; this produces more output
  (alas) but makes pinpointing the errors much simpler.
- Added some tests catching ``foo AND -bar'' and similar.
- Added an explicit test class for the handling of stopwords.  The
  "and/" test no longer has to check self.__class__.
- Some refactoring of the TestQueryParser class; the utility methods
  are now in a base class TestQueryParserBase, in a different order;
  compareParseTrees() now shows the parse tree it got when raising an
  exception.  The parser is now self.parser instead of self.p (see
  below).

tests/testZCTextIndex.py:

- setUp() no longer needs to assign to self.p; the parser is
  consistently called self.parser now.

QueryParser.py:
- Rephrased the description of the grammar, pointing out that the
  lexicon decides on globbing syntax.
- Refactored term and atom parsing (moving atom parsing into a
  separate method).  The previously checked-in version accidentally
  accepted some invalid forms like ``foo AND -bar''; this is fixed.

tests/testQueryParser.py:

- Each test is now in a separate method; this produces more output
  (alas) but makes pinpointing the errors much simpler.
- Added some tests catching ``foo AND -bar'' and similar.
- Added an explicit test class for the handling of stopwords.  The
  "and/" test no longer has to check self.__class__.
- Some refactoring of the TestQueryParser class; the utility methods
  are now in a base class TestQueryParserBase, in a different order;
  compareParseTrees() now shows the parse tree it got when raising an
  exception.  The parser is now self.parser instead of self.p (see
  below).

tests/testZCTextIndex.py:

- setUp() no longer needs to assign to self.p; the parser is
  consistently called self.parser now.
47bb995d · Guido van Rossum · 98607a5c · 47bb995d · 47bb995d · 47bb995d
Commit 47bb995d authored May 20, 2002 by Guido van Rossum
3 changed files
--- a/lib/python/Products/ZCTextIndex/QueryParser.py
+++ b/lib/python/Products/ZCTextIndex/QueryParser.py
@@ -27,17 +27,19 @@ The key words (AND, OR, NOT) are recognized in any mixture of case.
 An ATOM is either:
 + A sequence of characters not containing whitespace or parentheses or
-  double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+  double quotes, and not equal (ignoring case) to one of the key words
+  'AND', 'OR', 'NOT'; or
-+ A non-empty string enclosed in double quotes.  The interior of the string
++ A non-empty string enclosed in double quotes.  The interior of the
-  can contain whitespace, parentheses and key words.
+  string can contain whitespace, parentheses and key words, but not
+  quotes.
-In addition, an ATOM may optionally be preceded by a hyphen, meaning
++ A hyphen followed by one of the two forms above, meaning that it
-that it must not be present.
+  must not be present.
-An unquoted ATOM may also end in a star.  This is a primitive
+An unquoted ATOM may also contain globbing characters.  Globbing
-"globbing" function, meaning to search for any word with a given
+syntax is defined by the lexicon; for example "foo*" could mean any
-prefix.
+word starting with "foo".
 When multiple consecutive ATOMs are found at the leaf level, they are
 connected by an implied AND operator, and an unquoted leading hyphen
@@ -202,32 +204,37 @@ class QueryParser:
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
-            atoms = [self._get(_ATOM)]
-            while self._peek(_ATOM):
-                atoms.append(self._get(_ATOM))
            nodes = []
-            nots = []
+            nodes = [self._parseAtom()]
-            for a in atoms:
+            while self._peek(_ATOM):
-                words = self._lexicon.parseTerms(a)
+                nodes.append(self._parseAtom())
-                if not words:
+            nodes = filter(None, nodes)
-                    self._ignored.append(a)
-                    continue # Only stopwords
-                if len(words) > 1:
-                    n = ParseTree.PhraseNode(" ".join(words))
-                elif self._lexicon.isGlob(words[0]):
-                    n = ParseTree.GlobNode(words[0])
-                else:
-                    n = ParseTree.AtomNode(words[0])
-                if a[0] == "-":
-                    n = ParseTree.NotNode(n)
-                    nots.append(n)
-                else:
-                    nodes.append(n)
            if not nodes:
-                return None # Only stowords
+                return None # Only stopwords
-            nodes.extend(nots)
+            structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
+                         for i in range(len(nodes))]
+            structure.sort()
+            nodes = [node for (bit, index, node) in structure]
+            if isinstance(nodes[0], ParseTree.NotNode):
+                raise ParseTree.ParseError(
+                    "a term must have at least one positive word")
            if len(nodes) == 1:
-                tree = nodes[0]
+                return nodes[0]
-            else:
+            tree = ParseTree.AndNode(nodes)
-                tree = ParseTree.AndNode(nodes)
+        return tree
+    def _parseAtom(self):
+        term = self._get(_ATOM)
+        words = self._lexicon.parseTerms(term)
+        if not words:
+            self._ignored.append(term)
+            return None
+        if len(words) > 1:
+            tree = ParseTree.PhraseNode(words)
+        elif self._lexicon.isGlob(words[0]):
+            tree = ParseTree.GlobNode(words[0])
+        else:
+            tree = ParseTree.AtomNode(words[0])
+        if term[0] == "-":
+            tree = ParseTree.NotNode(tree)
        return tree
--- a/lib/python/Products/ZCTextIndex/tests/testQueryParser.py
+++ b/lib/python/Products/ZCTextIndex/tests/testQueryParser.py
@@ -21,116 +21,254 @@ from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
 from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
 from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
-class TestQueryParser(TestCase):
+class TestQueryParserBase(TestCase):
-    def compareParseTrees(self, got, expected):
+    def setUp(self):
+        self.lexicon = Lexicon(Splitter())
+        self.parser = QueryParser(self.lexicon)
+    def expect(self, input, output, expected_ignored=[]):
+        tree = self.parser.parseQuery(input)
+        ignored = self.parser.getIgnored()
+        self.compareParseTrees(tree, output)
+        self.assertEqual(ignored, expected_ignored)
+        # Check that parseQueryEx() == (parseQuery(), getIgnored())
+        ex_tree, ex_ignored = self.parser.parseQueryEx(input)
+        self.compareParseTrees(ex_tree, tree)
+        self.assertEqual(ex_ignored, expected_ignored)
+    def failure(self, input):
+        self.assertRaises(ParseError, self.parser.parseQuery, input)
+        self.assertRaises(ParseError, self.parser.parseQueryEx, input)
+    def compareParseTrees(self, got, expected, msg=None):
+        if msg is None:
+            msg = repr(got)
        self.assertEqual(isinstance(got, ParseTreeNode), 1)
-        self.assertEqual(got.__class__, expected.__class__)
+        self.assertEqual(got.__class__, expected.__class__, msg)
        if isinstance(got, PhraseNode):
-            self.assertEqual(got.nodeType(), "PHRASE")
+            self.assertEqual(got.nodeType(), "PHRASE", msg)
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, GlobNode):
-            self.assertEqual(got.nodeType(), "GLOB")
+            self.assertEqual(got.nodeType(), "GLOB", msg)
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AtomNode):
-            self.assertEqual(got.nodeType(), "ATOM")
+            self.assertEqual(got.nodeType(), "ATOM", msg)
-            self.assertEqual(got.getValue(), expected.getValue())
+            self.assertEqual(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, NotNode):
            self.assertEqual(got.nodeType(), "NOT")
-            self.compareParseTrees(got.getValue(), expected.getValue())
+            self.compareParseTrees(got.getValue(), expected.getValue(), msg)
        elif isinstance(got, AndNode) or isinstance(got, OrNode):
            self.assertEqual(got.nodeType(),
-                             isinstance(got, AndNode) and "AND" or "OR")
+                             isinstance(got, AndNode) and "AND" or "OR", msg)
            list1 = got.getValue()
            list2 = expected.getValue()
-            self.assertEqual(len(list1), len(list2))
+            self.assertEqual(len(list1), len(list2), msg)
            for i in range(len(list1)):
-                self.compareParseTrees(list1[i], list2[i])
+                self.compareParseTrees(list1[i], list2[i], msg)
-    def expect(self, input, output):
+class TestQueryParser(TestQueryParserBase):
-        tree = self.p.parseQuery(input)
-        self.compareParseTrees(tree, output)
-    def failure(self, input):
+    def test001(self):
-        self.assertRaises(ParseError, self.p.parseQuery, input)
-    def setUp(self):
-        self.lexicon = Lexicon(Splitter())
-        self.p = QueryParser(self.lexicon)
-    def testParseQuery(self):
        self.expect("foo", AtomNode("foo"))
+    def test002(self):
        self.expect("note", AtomNode("note"))
+    def test003(self):
        self.expect("aa and bb AND cc",
                    AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+    def test004(self):
        self.expect("aa OR bb or cc",
                    OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+    def test005(self):
        self.expect("aa AND bb OR cc AnD dd",
                    OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
                            AndNode([AtomNode("cc"), AtomNode("dd")])]))
+    def test006(self):
        self.expect("(aa OR bb) AND (cc OR dd)",
                    AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
                             OrNode([AtomNode("cc"), AtomNode("dd")])]))
-        self.expect("aa AND not bb",
+    def test007(self):
+        self.expect("aa AND NOT bb",
                    AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
-        self.expect('"foo bar"', PhraseNode("foo bar"))
+    def test010(self):
+        self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
+    def test011(self):
        self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
-        self.expect('(("foo bar"))"', PhraseNode("foo bar"))
+    def test012(self):
+        self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
+    def test013(self):
        self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
-        if self.__class__ is TestQueryParser:
+    def test014(self):
-            # This test fails when testZCTextIndex subclasses this class,
+        self.expect("foo-bar", PhraseNode(["foo", "bar"]))
-            # because its lexicon's pipeline removes stopwords
-            self.expect('and/', AtomNode("and"))
-        self.expect("foo-bar", PhraseNode("foo bar"))
+    def test015(self):
        self.expect("foo -bar", AndNode([AtomNode("foo"),
                                         NotNode(AtomNode("bar"))]))
+    def test016(self):
        self.expect("-foo bar", AndNode([AtomNode("bar"),
                                         NotNode(AtomNode("foo"))]))
+    def test017(self):
        self.expect("booh -foo-bar",
                    AndNode([AtomNode("booh"),
-                             NotNode(PhraseNode("foo bar"))]))
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+    def test018(self):
        self.expect('booh -"foo bar"',
                    AndNode([AtomNode("booh"),
-                             NotNode(PhraseNode("foo bar"))]))
+                             NotNode(PhraseNode(["foo", "bar"]))]))
+    def test019(self):
        self.expect('foo"bar"',
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
+    def test020(self):
        self.expect('"foo"bar',
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
+    def test021(self):
        self.expect('foo"bar"blech',
                    AndNode([AtomNode("foo"), AtomNode("bar"),
                             AtomNode("blech")]))
+    def test022(self):
        self.expect("foo*", GlobNode("foo*"))
+    def test023(self):
        self.expect("foo* bar", AndNode([GlobNode("foo*"),
                                         AtomNode("bar")]))
-    def testParseFailures(self):
+    def test101(self):
        self.failure("")
+    def test102(self):
        self.failure("not")
+    def test103(self):
+        self.failure("or")
+    def test104(self):
+        self.failure("and")
+    def test105(self):
+        self.failure("NOT")
+    def test106(self):
        self.failure("OR")
+    def test107(self):
        self.failure("AND")
-        self.failure("not foo")
+    def test108(self):
+        self.failure("NOT foo")
+    def test109(self):
        self.failure(")")
+    def test110(self):
        self.failure("(")
+    def test111(self):
        self.failure("foo OR")
+    def test112(self):
        self.failure("foo AND")
+    def test113(self):
        self.failure("OR foo")
-        self.failure("and foo")
+    def test114(self):
+        self.failure("AND foo")
+    def test115(self):
        self.failure("(foo) bar")
+    def test116(self):
        self.failure("(foo OR)")
+    def test117(self):
        self.failure("(foo AND)")
+    def test118(self):
        self.failure("(NOT foo)")
+    def test119(self):
        self.failure("-foo")
+    def test120(self):
        self.failure("-foo -bar")
-        self.failure('""')
+    def test121(self):
+        self.failure("foo OR -bar")
+    def test122(self):
+        self.failure("foo AND -bar")
+class StopWordTestQueryParser(TestQueryParserBase):
+    def setUp(self):
+        # Only 'stop' is a stopword (but 'and' is still an operator)
+        self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
+        self.parser = QueryParser(self.lexicon)
+    def test201(self):
+        self.expect('and/', AtomNode("and"))
+    def test202(self):
+        self.expect('foo AND stop', AtomNode("foo"), ["stop"])
+    def test203(self):
+        self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
+    def test204(self):
+        self.expect('stop AND foo', AtomNode("foo"), ["stop"])
+    def test205(self):
+        self.expect('foo OR stop', AtomNode("foo"), ["stop"])
+    def test206(self):
+        self.expect('stop OR foo', AtomNode("foo"), ["stop"])
+    def test301(self):
+        self.failure('stop')
+    def test302(self):
+        self.failure('stop stop')
+    def test303(self):
+        self.failure('stop AND stop')
+    def test304(self):
+        self.failure('stop OR stop')
+    def test305(self):
+        self.failure('stop -foo')
+    def test306(self):
+        self.failure('stop AND NOT foo')
+class FakeStopWordRemover:
+    def process(self, list):
+        return [word for word in list if word != "stop"]
 def test_suite():
-    return makeSuite(TestQueryParser)
+    return TestSuite((makeSuite(TestQueryParser),
+                      makeSuite(StopWordTestQueryParser),
+                    ))
 if __name__=="__main__":
    main(defaultTest='test_suite')
--- a/lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
+++ b/lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
@@ -454,7 +454,7 @@ class QueryTestsBase(testQueryEngine.TestQueryEngine,
                               StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', extra, caller, self.IndexFactory)
-        self.p = self.parser = QueryParser(self.lexicon)
+        self.parser = QueryParser(self.lexicon)
        self.index = self.zc_index.index
        self.add_docs()