Commit b82b2746 authored by Guido van Rossum

Refactor the query parser to rely on the lexicon for parsing terms.

ILexicon.py:

  - Added parseTerms() and isGlob(); see the usage sketch below.

  - Added get_word() and get_wid() (get_word() already existed on the
    implementation; get_wid() is new, added for symmetry).

  - Reflowed some text.
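
  An illustrative sketch (not part of this commit) of how the two new
  methods fit together; the pipeline shown is just one possible
  configuration:

      from Products.ZCTextIndex.Lexicon import \
           Lexicon, Splitter, CaseNormalizer, StopWordRemover

      lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())

      for term in lexicon.parseTerms("The Zope* Book"):
          # "the" is dropped by the stopword remover; "zope*" survives
          # the pipeline with its trailing "*" intact
          if lexicon.isGlob(term):
              wids = lexicon.globToWordIds(term)  # expand the pattern
          else:
              wids = [lexicon.get_wid(term)]      # 0 if unknown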

IQueryParser.py:

  - Expanded docs for parseQuery().

  - Added getIgnored() and parseQueryEx().
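
  A sketch of the intended calling patterns (illustrative; which terms
  are ignored depends on the lexicon's stopword handling):

      from Products.ZCTextIndex.QueryParser import QueryParser

      parser = QueryParser(lexicon)

      # Either one call...
      tree, ignored = parser.parseQueryEx("the quick fox")

      # ...or the equivalent two:
      tree = parser.parseQuery("the quick fox")
      ignored = parser.getIgnored()  # e.g. ["the"] with an English stoplist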

IPipelineElement.py:

  - Added processGlob().

Lexicon.py:

  - Added parseTerms() and isGlob().

  - Added get_wid().

  - Some pipeline elements now support processGlob().
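
  The dispatch in parseTerms() is a plain getattr() fallback, so
  pipeline elements that don't implement processGlob() keep working
  unchanged:

      for element in self._pipeline:
          process = getattr(element, "processGlob", element.process)
          last = process(last)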

ParseTree.py:

  - Clarified the error message for calling executeQuery() on a
    NotNode.

QueryParser.py (lots of changes):

  - Changed private names (__tokens etc.) to protected names (_tokens
    etc.).

  - Added getIgnored() and parseQueryEx() methods.

  - The atom parser now uses the lexicon's parseTerms() and isGlob()
    methods.

  - Query parts that consist only of stopwords (as determined by the
    lexicon), or of stopwords and negated terms, yield None instead of
    a parse tree node; each ignored term is added to self._ignored.
    None operands are dropped when combining terms for the AND/OR/NOT
    operators, and an operator left with no non-None operands itself
    returns None.  When this None percolates all the way to the top,
    the parser raises a ParseError exception.
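
  For example, assuming a lexicon with an English stopword remover in
  its pipeline (these cases mirror the new tests below):

      parser = QueryParser(lexicon)

      parser.parseQuery("to be")                # ParseError: stopwords only
      parser.parseQuery("to AND NOT question")  # ParseError: stopwords plus
                                                # a negated term
      tree = parser.parseQuery("question AND to AND be")
      # OK: reduces to AtomNode("question"); "to" and "be" end up in
      # parser.getIgnored()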

tests/testQueryParser.py:

  - Changed test expressions of the form "a AND b AND c" to "aa AND bb
    AND cc" so that the terms won't be considered stopwords.

  - The test for "and/" can only work for the base class; in the
    subclass the lexicon's pipeline removes stopwords.

tests/testZCTextIndex.py:

  - Added copyright notice.

  - Refactored testStopWords() to use two helpers, one for successes,
    one for failures.

  - Changed testStopWords() to require parser failure for those queries
    that have only stopwords or stopwords plus negated terms.

  - Improved compareSet() to sort the sets of keys and to use a more
    direct way of extracting the keys.  This wasn't strictly needed
    (nothing fails without it), but the old approach of copying the
    keys into a dict in a loop depends on dict hashing always returning
    the keys in the same order.
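
  A minimal sketch of the idea, with hypothetical result mappings: two
  mappings can hold the same keys yet enumerate them in different
  orders, so canonicalize before comparing:

      setkeys = list(faux_results.keys())
      dictkeys = list(real_results.keys())
      setkeys.sort()
      dictkeys.sort()
      assert setkeys == dictkeys
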
parent 5f66a3ce
ILexicon.py
@@ -22,8 +22,8 @@ class ILexicon(Interface):

         The input text may be either a string or a list of strings.

-        Parses the text as if they are search terms, and skips words that
-        aren't in the lexicon.
+        Parse the text as if they are search terms, and skips words
+        that aren't in the lexicon.
         """

     def sourceToWordIds(text):
@@ -31,8 +31,9 @@ class ILexicon(Interface):

         The input text may be either a string or a list of strings.

-        Parses the text as if they come from a source document, and creates
-        new word ids for words that aren't (yet) in the lexicon.
+        Parse the text as if they come from a source document, and
+        creates new word ids for words that aren't (yet) in the
+        lexicon.
         """

     def globToWordIds(pattern):
@@ -43,9 +44,34 @@ class ILexicon(Interface):

         NOTE: Currently only a single trailing * is supported.

-        Returns the wids for all words in the lexicon that match the
+        Return the wids for all words in the lexicon that match the
         pattern.
         """

     def length():
         """Return the number of unique term in the lexicon."""
+
+    def get_word(wid):
+        """Return the word for the given word id.
+
+        Raise KeyError if the word id is not in the lexicon.
+        """
+
+    def get_wid(word):
+        """Return the word id for the given word.
+
+        Return 0 if the word is not in the lexicon.
+        """
+
+    def parseTerms(text):
+        """Pass the text through the pipeline.
+
+        Return a list of words, normalized by the pipeline
+        (e.g. stopwords removed, case normalized etc.).
+        """
+
+    def isGlob(word):
+        """Return true if the word is a globbing pattern.
+
+        The word should be one of the words returned by parseTerms().
+        """
IPipelineElement.py
@@ -21,3 +21,9 @@ class IPipelineElement(Interface):

         Process a source sequence of words into a result sequence.
         """
+
+    def processGlob(source):
+        """Process, passing through globbing metacharacters.
+
+        This is an optional method; if it is not used, process() is used.
+        """
IQueryParser.py
@@ -24,6 +24,31 @@ class IQueryParser(Interface.Base):

         Return a parse tree (which implements IQueryParseTree).

+        Some of the query terms may be ignored because they are
+        stopwords; use getIgnored() to find out which terms were
+        ignored.  But if the entire query consists only of stop words,
+        or of stopwords and one or more negated terms, an exception is
+        raised.
+
+        May raise ParseTree.ParseError.
+        """
+
+    def getIgnored():
+        """Return the list of ignored terms.
+
+        Return the list of terms that were ignored by the most recent
+        call to parseQuery() because they were stopwords.
+
+        If parseQuery() was never called this returns None.
+        """
+
+    def parseQueryEx(query):
+        """Parse a query string.
+
+        Return a tuple (tree, ignored) where 'tree' is the parse tree
+        as returned by parseQuery(), and 'ignored' is a list of
+        ignored terms as returned by getIgnored().
+
         May raise ParseTree.ParseError.
         """
Lexicon.py
@@ -69,10 +69,22 @@ class Lexicon:
             wids.append(self._wids.get(word, 0))
         return wids

+    def parseTerms(self, text):
+        last = _text2list(text)
+        for element in self._pipeline:
+            process = getattr(element, "processGlob", element.process)
+            last = process(last)
+        return last
+
+    def isGlob(self, word):
+        return "*" in word
+
     def get_word(self, wid):
         """Return the word for the given word id"""
         return self._words[wid]

+    def get_wid(self, word):
+        return self._wids.get(word, 0)
+
     def globToWordIds(self, pattern):
         if not re.match("^\w+\*$", pattern):
             return []
@@ -116,6 +128,7 @@ class Splitter:

     import re
     rx = re.compile(r"\w+")
+    rxGlob = re.compile(r"\w+\*?")

     def process(self, lst):
         result = []
@@ -123,6 +136,12 @@ class Splitter:
             result += self.rx.findall(s)
         return result

+    def processGlob(self, lst):
+        result = []
+        for s in lst:
+            result += self.rxGlob.findall(s)
+        return result
+
 class CaseNormalizer:

     def process(self, lst):
ParseTree.py
@@ -58,7 +58,7 @@ class NotNode(ParseTreeNode):
         return []

     def executeQuery(self, index):
-        raise QueryError, "NOT operator must occur right after AND"
+        raise QueryError, "NOT parse tree node cannot be executed directly"

 class AndNode(ParseTreeNode):
QueryParser.py
@@ -83,7 +83,7 @@ _tokenizer_regex = re.compile(r"""
     | -?
     # followed by
     (?:
-        # a string
+        # a string inside double quotes (and not containing these)
         " [^"]* "
     # or a non-empty stretch w/o whitespace, parens or double quotes
     | [^()\s"]+
@@ -92,46 +92,64 @@ _tokenizer_regex = re.compile(r"""

 class QueryParser:

+    # This class is not thread-safe;
+    # each thread should have its own instance
+
     def __init__(self, lexicon):
         self._lexicon = lexicon
+        self._ignored = None
+
+    # Public API methods

     def parseQuery(self, query):
         # Lexical analysis.
         tokens = _tokenizer_regex.findall(query)
-        self.__tokens = tokens
+        self._tokens = tokens
         # classify tokens
-        self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
+        self._tokentypes = [_keywords.get(token.upper(), _ATOM)
                             for token in tokens]
         # add _EOF
-        self.__tokens.append(_EOF)
-        self.__tokentypes.append(_EOF)
-        self.__index = 0
+        self._tokens.append(_EOF)
+        self._tokentypes.append(_EOF)
+        self._index = 0

         # Syntactical analysis.
+        self._ignored = []  # Ignored words in the query, for parseQueryEx
         tree = self._parseOrExpr()
         self._require(_EOF)
+        if tree is None:
+            raise ParseTree.ParseError(
+                "Query contains only common words: %s" % repr(query))
         return tree

+    def getIgnored(self):
+        return self._ignored
+
+    def parseQueryEx(self, query):
+        tree = self.parseQuery(query)
+        ignored = self.getIgnored()
+        return tree, ignored
+
     # Recursive descent parser

     def _require(self, tokentype):
         if not self._check(tokentype):
-            t = self.__tokens[self.__index]
+            t = self._tokens[self._index]
             msg = "Token %r required, %r found" % (tokentype, t)
             raise ParseTree.ParseError, msg

     def _check(self, tokentype):
-        if self.__tokentypes[self.__index] is tokentype:
-            self.__index += 1
+        if self._tokentypes[self._index] is tokentype:
+            self._index += 1
             return 1
         else:
             return 0

     def _peek(self, tokentype):
-        return self.__tokentypes[self.__index] is tokentype
+        return self._tokentypes[self._index] is tokentype

     def _get(self, tokentype):
-        t = self.__tokens[self.__index]
+        t = self._tokens[self._index]
         self._require(tokentype)
         return t
@@ -140,16 +158,31 @@ class QueryParser:
         L.append(self._parseAndExpr())
         while self._check(_OR):
             L.append(self._parseAndExpr())
-        if len(L) == 1:
+        L = filter(None, L)
+        if not L:
+            return None  # Only stopwords
+        elif len(L) == 1:
             return L[0]
         else:
             return ParseTree.OrNode(L)

     def _parseAndExpr(self):
         L = []
-        L.append(self._parseTerm())
+        t = self._parseTerm()
+        if t is not None:
+            L.append(t)
+        Nots = []
         while self._check(_AND):
-            L.append(self._parseNotExpr())
+            t = self._parseNotExpr()
+            if t is None:
+                continue
+            if isinstance(t, ParseTree.NotNode):
+                Nots.append(t)
+            else:
+                L.append(t)
+        if not L:
+            return None  # Only stopwords
+        L.extend(Nots)
         if len(L) == 1:
             return L[0]
         else:
@@ -157,7 +190,10 @@ class QueryParser:

     def _parseNotExpr(self):
         if self._check(_NOT):
-            return ParseTree.NotNode(self._parseTerm())
+            t = self._parseTerm()
+            if t is None:
+                return None  # Only stopwords
+            return ParseTree.NotNode(t)
         else:
             return self._parseTerm()
@@ -172,12 +208,13 @@ class QueryParser:
         nodes = []
         nots = []
         for a in atoms:
-            words = re.findall(r"\w+\*?", a)
+            words = self._lexicon.parseTerms(a)
             if not words:
-                continue
+                self._ignored.append(a)
+                continue  # Only stopwords
             if len(words) > 1:
                 n = ParseTree.PhraseNode(" ".join(words))
-            elif words[0].endswith("*"):
+            elif self._lexicon.isGlob(words[0]):
                 n = ParseTree.GlobNode(words[0])
             else:
                 n = ParseTree.AtomNode(words[0])
@@ -187,9 +224,7 @@ class QueryParser:
             else:
                 nodes.append(n)
         if not nodes:
-            text = " ".join(atoms)
-            msg = "At least one positive term required: %r" % text
-            raise ParseTree.ParseError, msg
+            return None  # Only stopwords
         nodes.extend(nots)
         if len(nodes) == 1:
             tree = nodes[0]
tests/testQueryParser.py
@@ -61,18 +61,18 @@ class TestQueryParser(TestCase):

     def testParseQuery(self):
         self.expect("foo", AtomNode("foo"))
         self.expect("note", AtomNode("note"))
-        self.expect("a and b AND c",
-                    AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
-        self.expect("a OR b or c",
-                    OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
-        self.expect("a AND b OR c AnD d",
-                    OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
-                            AndNode([AtomNode("c"), AtomNode("d")])]))
-        self.expect("(a OR b) AND (c OR d)",
-                    AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
-                             OrNode([AtomNode("c"), AtomNode("d")])]))
-        self.expect("a AND not b",
-                    AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
+        self.expect("aa and bb AND cc",
+                    AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+        self.expect("aa OR bb or cc",
+                    OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
+        self.expect("aa AND bb OR cc AnD dd",
+                    OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
+                            AndNode([AtomNode("cc"), AtomNode("dd")])]))
+        self.expect("(aa OR bb) AND (cc OR dd)",
+                    AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
+                             OrNode([AtomNode("cc"), AtomNode("dd")])]))
+        self.expect("aa AND not bb",
+                    AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
         self.expect('"foo bar"', PhraseNode("foo bar"))
         self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
@@ -80,7 +80,10 @@ class TestQueryParser(TestCase):
         self.expect('(("foo bar"))"', PhraseNode("foo bar"))
         self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
-        self.expect('and/', AtomNode("and"))
+        if self.__class__ is TestQueryParser:
+            # This test fails when testZCTextIndex subclasses this class,
+            # because its lexicon's pipeline removes stopwords
+            self.expect('and/', AtomNode("and"))
         self.expect("foo-bar", PhraseNode("foo bar"))
         self.expect("foo -bar", AndNode([AtomNode("foo"),
tests/testZCTextIndex.py
+##############################################################################
+#
+# Copyright (c) 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
 from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
 from Products.ZCTextIndex.tests \
     import testIndex, testQueryEngine, testQueryParser
@@ -9,6 +23,7 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
 from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
 from Products.ZCTextIndex.QueryParser import QueryParser
 from Products.ZCTextIndex.StopDict import get_stopdict
+from Products.ZCTextIndex.ParseTree import ParseError
 import re
 import unittest
@@ -84,6 +99,15 @@ class ZCIndexTestsBase:
         self.index = self.zc_index.index
         self.lexicon = self.zc_index.lexicon

+    def parserFailure(self, query):
+        self.assertRaises(ParseError, self.zc_index.query, query)
+
+    def parserSuccess(self, query, n):
+        r, num = self.zc_index.query(query)
+        self.assertEqual(num, n)
+        if n:
+            self.assertEqual(r[0][0], 1)
+
     def testStopWords(self):
         # the only non-stopword is question
         text = ("to be or not to be "
@@ -96,61 +120,23 @@ class ZCIndexTestsBase:
         self.assertEqual(wids, [])
         self.assertEqual(len(self.index.get_words(1)), 1)

-        r, num = self.zc_index.query('question')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('question AND to AND be')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('to AND question AND be')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('to AND NOT question')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('to AND NOT gardenia')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('question AND NOT gardenia')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('question AND gardenia')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('gardenia')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('question OR gardenia')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('question AND NOT to AND NOT be')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('question OR to OR be')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('question to be')
-        self.assertEqual(num, 1)
-        self.assertEqual(r[0][0], 1)
-
-        r, num = self.zc_index.query('to be')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('to AND be')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('to OR be')
-        self.assertEqual(num, 0)
-
-        r, num = self.zc_index.query('to AND NOT be')
-        self.assertEqual(num, 0)
+        self.parserSuccess('question', 1)
+        self.parserSuccess('question AND to AND be', 1)
+        self.parserSuccess('to AND question AND be', 1)
+        self.parserSuccess('question AND NOT gardenia', 1)
+        self.parserSuccess('question AND gardenia', 0)
+        self.parserSuccess('gardenia', 0)
+        self.parserSuccess('question OR gardenia', 1)
+        self.parserSuccess('question AND NOT to AND NOT be', 1)
+        self.parserSuccess('question OR to OR be', 1)
+        self.parserSuccess('question to be', 1)

+        self.parserFailure('to be')
+        self.parserFailure('to AND be')
+        self.parserFailure('to OR be')
+        self.parserFailure('to AND NOT be')
+        self.parserFailure('to AND NOT question')
+        self.parserFailure('to AND NOT gardenia')

     def testDocUpdate(self):
         docid = 1  # doesn't change -- we index the same doc repeatedly
@@ -482,10 +468,11 @@ class QueryTestsBase(testQueryEngine.TestQueryEngine,
         # XXX The FauxIndex and the real Index score documents very
         # differently.  The set comparison can't actually compare the
         # items, but it can compare the keys.  That will have to do for now.
-        d = {}
-        for k, v in set.items():
-            d[k] = v
-        self.assertEqual(d.keys(), dict.keys())
+        setkeys = list(set.keys())
+        dictkeys = dict.keys()
+        setkeys.sort()
+        dictkeys.sort()
+        self.assertEqual(setkeys, dictkeys)

 class CosineQueryTests(QueryTestsBase):
     IndexFactory = CosineIndex