Commit 47bb995d authored by Guido van Rossum

QueryParser.py:

- Rephrased the description of the grammar, pointing out that the
  lexicon decides on globbing syntax.

- Refactored term and atom parsing (moving atom parsing into a
  separate method).  The previously checked-in version accidentally
  accepted some invalid forms like ``foo AND -bar''; this is fixed.

tests/testQueryParser.py:

- Each test is now in a separate method; this produces more output
  (alas) but makes pinpointing the errors much simpler.

- Added some tests catching ``foo AND -bar'' and similar.

- Added an explicit test class for the handling of stopwords.  The
  "and/" test no longer has to check self.__class__.

- Some refactoring of the TestQueryParser class; the utility methods
  are now in a base class TestQueryParserBase, in a different order;
  compareParseTrees() now shows the parse tree it got when raising an
  exception.  The parser is now self.parser instead of self.p (see
  below).

tests/testZCTextIndex.py:

- setUp() no longer needs to assign to self.p; the parser is
  consistently called self.parser now.
parent 98607a5c
...@@ -27,17 +27,19 @@ The key words (AND, OR, NOT) are recognized in any mixture of case. ...@@ -27,17 +27,19 @@ The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either: An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or + A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or double quotes, and not equal (ignoring case) to one of the key words
'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string + A non-empty string enclosed in double quotes. The interior of the
can contain whitespace, parentheses and key words. string can contain whitespace, parentheses and key words, but not
quotes.
In addition, an ATOM may optionally be preceded by a hyphen, meaning + A hyphen followed by one of the two forms above, meaning that it
that it must not be present. must not be present.
An unquoted ATOM may also end in a star. This is a primitive An unquoted ATOM may also contain globbing characters. Globbing
"globbing" function, meaning to search for any word with a given syntax is defined by the lexicon; for example "foo*" could mean any
prefix. word starting with "foo".
When multiple consecutive ATOMs are found at the leaf level, they are When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen connected by an implied AND operator, and an unquoted leading hyphen
...@@ -202,32 +204,37 @@ class QueryParser: ...@@ -202,32 +204,37 @@ class QueryParser:
tree = self._parseOrExpr() tree = self._parseOrExpr()
self._require(_RPAREN) self._require(_RPAREN)
else: else:
atoms = [self._get(_ATOM)]
while self._peek(_ATOM):
atoms.append(self._get(_ATOM))
nodes = [] nodes = []
nots = [] nodes = [self._parseAtom()]
for a in atoms: while self._peek(_ATOM):
words = self._lexicon.parseTerms(a) nodes.append(self._parseAtom())
if not words: nodes = filter(None, nodes)
self._ignored.append(a)
continue # Only stopwords
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
elif self._lexicon.isGlob(words[0]):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
if a[0] == "-":
n = ParseTree.NotNode(n)
nots.append(n)
else:
nodes.append(n)
if not nodes: if not nodes:
return None # Only stowords return None # Only stopwords
nodes.extend(nots) structure = [(isinstance(nodes[i], ParseTree.NotNode), i, nodes[i])
for i in range(len(nodes))]
structure.sort()
nodes = [node for (bit, index, node) in structure]
if isinstance(nodes[0], ParseTree.NotNode):
raise ParseTree.ParseError(
"a term must have at least one positive word")
if len(nodes) == 1: if len(nodes) == 1:
tree = nodes[0] return nodes[0]
else: tree = ParseTree.AndNode(nodes)
tree = ParseTree.AndNode(nodes) return tree
def _parseAtom(self):
term = self._get(_ATOM)
words = self._lexicon.parseTerms(term)
if not words:
self._ignored.append(term)
return None
if len(words) > 1:
tree = ParseTree.PhraseNode(words)
elif self._lexicon.isGlob(words[0]):
tree = ParseTree.GlobNode(words[0])
else:
tree = ParseTree.AtomNode(words[0])
if term[0] == "-":
tree = ParseTree.NotNode(tree)
return tree return tree
...@@ -21,116 +21,254 @@ from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode ...@@ -21,116 +21,254 @@ from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
class TestQueryParser(TestCase): class TestQueryParserBase(TestCase):
def compareParseTrees(self, got, expected): def setUp(self):
self.lexicon = Lexicon(Splitter())
self.parser = QueryParser(self.lexicon)
def expect(self, input, output, expected_ignored=[]):
tree = self.parser.parseQuery(input)
ignored = self.parser.getIgnored()
self.compareParseTrees(tree, output)
self.assertEqual(ignored, expected_ignored)
# Check that parseQueryEx() == (parseQuery(), getIgnored())
ex_tree, ex_ignored = self.parser.parseQueryEx(input)
self.compareParseTrees(ex_tree, tree)
self.assertEqual(ex_ignored, expected_ignored)
def failure(self, input):
self.assertRaises(ParseError, self.parser.parseQuery, input)
self.assertRaises(ParseError, self.parser.parseQueryEx, input)
def compareParseTrees(self, got, expected, msg=None):
if msg is None:
msg = repr(got)
self.assertEqual(isinstance(got, ParseTreeNode), 1) self.assertEqual(isinstance(got, ParseTreeNode), 1)
self.assertEqual(got.__class__, expected.__class__) self.assertEqual(got.__class__, expected.__class__, msg)
if isinstance(got, PhraseNode): if isinstance(got, PhraseNode):
self.assertEqual(got.nodeType(), "PHRASE") self.assertEqual(got.nodeType(), "PHRASE", msg)
self.assertEqual(got.getValue(), expected.getValue()) self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, GlobNode): elif isinstance(got, GlobNode):
self.assertEqual(got.nodeType(), "GLOB") self.assertEqual(got.nodeType(), "GLOB", msg)
self.assertEqual(got.getValue(), expected.getValue()) self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AtomNode): elif isinstance(got, AtomNode):
self.assertEqual(got.nodeType(), "ATOM") self.assertEqual(got.nodeType(), "ATOM", msg)
self.assertEqual(got.getValue(), expected.getValue()) self.assertEqual(got.getValue(), expected.getValue(), msg)
elif isinstance(got, NotNode): elif isinstance(got, NotNode):
self.assertEqual(got.nodeType(), "NOT") self.assertEqual(got.nodeType(), "NOT")
self.compareParseTrees(got.getValue(), expected.getValue()) self.compareParseTrees(got.getValue(), expected.getValue(), msg)
elif isinstance(got, AndNode) or isinstance(got, OrNode): elif isinstance(got, AndNode) or isinstance(got, OrNode):
self.assertEqual(got.nodeType(), self.assertEqual(got.nodeType(),
isinstance(got, AndNode) and "AND" or "OR") isinstance(got, AndNode) and "AND" or "OR", msg)
list1 = got.getValue() list1 = got.getValue()
list2 = expected.getValue() list2 = expected.getValue()
self.assertEqual(len(list1), len(list2)) self.assertEqual(len(list1), len(list2), msg)
for i in range(len(list1)): for i in range(len(list1)):
self.compareParseTrees(list1[i], list2[i]) self.compareParseTrees(list1[i], list2[i], msg)
def expect(self, input, output): class TestQueryParser(TestQueryParserBase):
tree = self.p.parseQuery(input)
self.compareParseTrees(tree, output)
def failure(self, input): def test001(self):
self.assertRaises(ParseError, self.p.parseQuery, input)
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.p = QueryParser(self.lexicon)
def testParseQuery(self):
self.expect("foo", AtomNode("foo")) self.expect("foo", AtomNode("foo"))
def test002(self):
self.expect("note", AtomNode("note")) self.expect("note", AtomNode("note"))
def test003(self):
self.expect("aa and bb AND cc", self.expect("aa and bb AND cc",
AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")])) AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
def test004(self):
self.expect("aa OR bb or cc", self.expect("aa OR bb or cc",
OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")])) OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
def test005(self):
self.expect("aa AND bb OR cc AnD dd", self.expect("aa AND bb OR cc AnD dd",
OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]), OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
AndNode([AtomNode("cc"), AtomNode("dd")])])) AndNode([AtomNode("cc"), AtomNode("dd")])]))
def test006(self):
self.expect("(aa OR bb) AND (cc OR dd)", self.expect("(aa OR bb) AND (cc OR dd)",
AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]), AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
OrNode([AtomNode("cc"), AtomNode("dd")])])) OrNode([AtomNode("cc"), AtomNode("dd")])]))
self.expect("aa AND not bb",
def test007(self):
self.expect("aa AND NOT bb",
AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))])) AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
self.expect('"foo bar"', PhraseNode("foo bar")) def test010(self):
self.expect('"foo bar"', PhraseNode(["foo", "bar"]))
def test011(self):
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")])) self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('(("foo bar"))"', PhraseNode("foo bar")) def test012(self):
self.expect('(("foo bar"))"', PhraseNode(["foo", "bar"]))
def test013(self):
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")])) self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
if self.__class__ is TestQueryParser: def test014(self):
# This test fails when testZCTextIndex subclasses this class, self.expect("foo-bar", PhraseNode(["foo", "bar"]))
# because its lexicon's pipeline removes stopwords
self.expect('and/', AtomNode("and"))
self.expect("foo-bar", PhraseNode("foo bar")) def test015(self):
self.expect("foo -bar", AndNode([AtomNode("foo"), self.expect("foo -bar", AndNode([AtomNode("foo"),
NotNode(AtomNode("bar"))])) NotNode(AtomNode("bar"))]))
def test016(self):
self.expect("-foo bar", AndNode([AtomNode("bar"), self.expect("-foo bar", AndNode([AtomNode("bar"),
NotNode(AtomNode("foo"))])) NotNode(AtomNode("foo"))]))
def test017(self):
self.expect("booh -foo-bar", self.expect("booh -foo-bar",
AndNode([AtomNode("booh"), AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))])) NotNode(PhraseNode(["foo", "bar"]))]))
def test018(self):
self.expect('booh -"foo bar"', self.expect('booh -"foo bar"',
AndNode([AtomNode("booh"), AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))])) NotNode(PhraseNode(["foo", "bar"]))]))
def test019(self):
self.expect('foo"bar"', self.expect('foo"bar"',
AndNode([AtomNode("foo"), AtomNode("bar")])) AndNode([AtomNode("foo"), AtomNode("bar")]))
def test020(self):
self.expect('"foo"bar', self.expect('"foo"bar',
AndNode([AtomNode("foo"), AtomNode("bar")])) AndNode([AtomNode("foo"), AtomNode("bar")]))
def test021(self):
self.expect('foo"bar"blech', self.expect('foo"bar"blech',
AndNode([AtomNode("foo"), AtomNode("bar"), AndNode([AtomNode("foo"), AtomNode("bar"),
AtomNode("blech")])) AtomNode("blech")]))
def test022(self):
self.expect("foo*", GlobNode("foo*")) self.expect("foo*", GlobNode("foo*"))
def test023(self):
self.expect("foo* bar", AndNode([GlobNode("foo*"), self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")])) AtomNode("bar")]))
def testParseFailures(self): def test101(self):
self.failure("") self.failure("")
def test102(self):
self.failure("not") self.failure("not")
def test103(self):
self.failure("or")
def test104(self):
self.failure("and")
def test105(self):
self.failure("NOT")
def test106(self):
self.failure("OR") self.failure("OR")
def test107(self):
self.failure("AND") self.failure("AND")
self.failure("not foo")
def test108(self):
self.failure("NOT foo")
def test109(self):
self.failure(")") self.failure(")")
def test110(self):
self.failure("(") self.failure("(")
def test111(self):
self.failure("foo OR") self.failure("foo OR")
def test112(self):
self.failure("foo AND") self.failure("foo AND")
def test113(self):
self.failure("OR foo") self.failure("OR foo")
self.failure("and foo")
def test114(self):
self.failure("AND foo")
def test115(self):
self.failure("(foo) bar") self.failure("(foo) bar")
def test116(self):
self.failure("(foo OR)") self.failure("(foo OR)")
def test117(self):
self.failure("(foo AND)") self.failure("(foo AND)")
def test118(self):
self.failure("(NOT foo)") self.failure("(NOT foo)")
def test119(self):
self.failure("-foo") self.failure("-foo")
def test120(self):
self.failure("-foo -bar") self.failure("-foo -bar")
self.failure('""')
def test121(self):
self.failure("foo OR -bar")
def test122(self):
self.failure("foo AND -bar")
class StopWordTestQueryParser(TestQueryParserBase):
def setUp(self):
# Only 'stop' is a stopword (but 'and' is still an operator)
self.lexicon = Lexicon(Splitter(), FakeStopWordRemover())
self.parser = QueryParser(self.lexicon)
def test201(self):
self.expect('and/', AtomNode("and"))
def test202(self):
self.expect('foo AND stop', AtomNode("foo"), ["stop"])
def test203(self):
self.expect('foo AND NOT stop', AtomNode("foo"), ["stop"])
def test204(self):
self.expect('stop AND foo', AtomNode("foo"), ["stop"])
def test205(self):
self.expect('foo OR stop', AtomNode("foo"), ["stop"])
def test206(self):
self.expect('stop OR foo', AtomNode("foo"), ["stop"])
def test301(self):
self.failure('stop')
def test302(self):
self.failure('stop stop')
def test303(self):
self.failure('stop AND stop')
def test304(self):
self.failure('stop OR stop')
def test305(self):
self.failure('stop -foo')
def test306(self):
self.failure('stop AND NOT foo')
class FakeStopWordRemover:
def process(self, list):
return [word for word in list if word != "stop"]
def test_suite(): def test_suite():
return makeSuite(TestQueryParser) return TestSuite((makeSuite(TestQueryParser),
makeSuite(StopWordTestQueryParser),
))
if __name__=="__main__": if __name__=="__main__":
main(defaultTest='test_suite') main(defaultTest='test_suite')
...@@ -454,7 +454,7 @@ class QueryTestsBase(testQueryEngine.TestQueryEngine, ...@@ -454,7 +454,7 @@ class QueryTestsBase(testQueryEngine.TestQueryEngine,
StopWordRemover()) StopWordRemover())
caller = LexiconHolder(self.lexicon) caller = LexiconHolder(self.lexicon)
self.zc_index = ZCTextIndex('name', extra, caller, self.IndexFactory) self.zc_index = ZCTextIndex('name', extra, caller, self.IndexFactory)
self.p = self.parser = QueryParser(self.lexicon) self.parser = QueryParser(self.lexicon)
self.index = self.zc_index.index self.index = self.zc_index.index
self.add_docs() self.add_docs()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment