Commit b82b2746 authored by Guido van Rossum

Refactor the query parser to rely on the lexicon for parsing terms.

ILexicon.py:

  - Added parseTerms() and isGlob().

  - Added get_word() and get_wid() to the interface (get_word() already
    existed in the implementation; get_wid() is new, for symmetry).

  - Reflowed some text.

IQueryParser.py:

  - Expanded docs for parseQuery().

  - Added getIgnored() and parseQueryEx().

IPipelineElement.py:

  - Added processGlob().
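
    For illustration, a minimal sketch of the optional-method protocol
    (both element classes here are hypothetical; the getattr() fallback
    is the same dispatch idiom Lexicon.parseTerms() uses below):

        import re

        class GlobSplitter:
            # Hypothetical element with a glob-aware path; a trailing
            # "*" survives the rxGlob regex.
            rx = re.compile(r"\w+")
            rxGlob = re.compile(r"\w+\*?")

            def process(self, lst):
                return [w for s in lst for w in self.rx.findall(s)]

            def processGlob(self, lst):
                return [w for s in lst for w in self.rxGlob.findall(s)]

        class CaseFolder:
            # Hypothetical element without processGlob(); callers fall
            # back to process(), which happens to preserve "*" anyway.
            def process(self, lst):
                return [w.lower() for w in lst]

        words = ["Zope*"]
        for element in (GlobSplitter(), CaseFolder()):
            process = getattr(element, "processGlob", element.process)
            words = process(words)
        print(words)  # ['zope*'] -- the glob marker survives the pipeline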

Lexicon.py:

  - Added parseTerms() and isGlob().

  - Added get_wid().

  - Some pipeline elements now support processGlob().
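
    A hypothetical session against this commit (the pipeline composition
    below is the package's usual one; the exact wid values are arbitrary):

        from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
        from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover

        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        lexicon.sourceToWordIds("the quick brown fox")  # index a few words

        # parseTerms() pushes query text through the pipeline, keeping
        # glob metacharacters; "the" is dropped as a stopword.
        print(lexicon.parseTerms("the Quick bro*"))  # ['quick', 'bro*']
        print(lexicon.isGlob("bro*"))                # true (trailing "*")
        print(lexicon.isGlob("quick"))               # false

        # get_wid() maps unknown words to 0; get_word() inverts a wid.
        wid = lexicon.get_wid("quick")
        print(lexicon.get_word(wid))                 # 'quick'
        print(lexicon.get_wid("unseen"))             # 0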

ParseTree.py:

  - Clarified the error message for calling executeQuery() on a
    NotNode.

QueryParser.py (lots of changes):

  - Changed private names (__tokens etc.) into protected names
    (_tokens etc.).

  - Added getIgnored() and parseQueryEx() methods.

  - The atom parser now uses the lexicon's parseTerms() and isGlob()
    methods.

  - Query parts that consist only of stopwords (as determined by the
    lexicon), or of stopwords and negated terms, yield None instead of
    a parse tree node; the ignored terms are added to self._ignored.
    None is ignored when combining terms for AND/OR/NOT operators, and
    when an operator has no non-None operands, the operator itself
    returns None.  When this None percolates all the way to the top,
    the parser raises a ParseError exception.
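
    A hypothetical session showing the new behavior (assuming the usual
    Splitter/CaseNormalizer/StopWordRemover pipeline, under which "to"
    and "be" are stopwords):

        from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
        from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
        from Products.ZCTextIndex.QueryParser import QueryParser
        from Products.ZCTextIndex.ParseTree import ParseError

        lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
        parser = QueryParser(lexicon)

        # Stopwords in a mixed query are recorded, not fatal:
        tree, ignored = parser.parseQueryEx("question AND to AND be")
        print(ignored)  # ['to', 'be']

        # A query of nothing but stopwords percolates None to the top,
        # and parseQuery() turns that into a ParseError:
        try:
            parser.parseQuery("to OR be")
        except ParseError, e:
            print(e)    # Query contains only common words: 'to OR be'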

tests/testQueryParser.py:

  - Changed test expressions of the form "a AND b AND c" to "aa AND bb
    AND cc" so that the terms won't be considered stopwords.

  - The test for "and/" can only pass for the base class; subclasses
    whose lexicon pipeline removes stopwords treat "and" as a stopword.

tests/testZCTextIndex.py:

  - Added copyright notice.

  - Refactored testStopWords() to use two helpers: one for expected
    successes, one for expected failures.

  - Changed testStopWords() to require a parser failure for those
    queries that have only stopwords or stopwords plus negated terms.

  - Improved compareSet() to sort the sets of keys and to use a more
    direct way of extracting the keys.  This wasn't strictly needed
    (nothing fails without it), but the old approach of copying the
    keys into a dict in a loop relied on dict hashing always returning
    keys in the same order.
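
    A toy sketch of the pitfall the sort avoids (comparing raw .keys()
    lists assumes both mappings happen to hash their keys into the same
    order):

        a = {2: "x", 1: "y"}
        b = {1: "p", 2: "q"}

        akeys = list(a.keys())
        bkeys = list(b.keys())
        akeys.sort()
        bkeys.sort()
        # After sorting, the comparison checks contents, not hash order.
        assert akeys == bkeys
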
parent 5f66a3ce
......@@ -22,8 +22,8 @@ class ILexicon(Interface):
The input text may be either a string or a list of strings.
Parses the text as if they are search terms, and skips words that
aren't in the lexicon.
Parse the text as if they are search terms, and skip words
that aren't in the lexicon.
"""
def sourceToWordIds(text):
......@@ -31,8 +31,9 @@ class ILexicon(Interface):
The input text may be either a string or a list of strings.
Parses the text as if they come from a source document, and creates
new word ids for words that aren't (yet) in the lexicon.
Parse the text as if they come from a source document, and
create new word ids for words that aren't (yet) in the
lexicon.
"""
def globToWordIds(pattern):
......@@ -43,9 +44,34 @@ class ILexicon(Interface):
NOTE: Currently only a single trailing * is supported.
Returns the wids for all words in the lexicon that match the
Return the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique terms in the lexicon."""
def get_word(wid):
"""Return the word for the given word id.
Raise KeyError if the word id is not in the lexicon.
"""
def get_wid(word):
"""Return the word id for the given word.
Return 0 if the word is not in the lexicon.
"""
def parseTerms(text):
"""Pass the text through the pipeline.
Return a list of words, normalized by the pipeline
(e.g. stopwords removed, case normalized etc.).
"""
def isGlob(word):
"""Return true if the word is a globbing pattern.
The word should be one of the words returned by parseTerms().
"""
......@@ -21,3 +21,9 @@ class IPipelineElement(Interface):
Process a source sequence of words into a result sequence.
"""
def processGlob(source):
"""Process, passing through globbing metacharacters.
This is an optional method; if an element does not implement
it, process() is used instead.
"""
......@@ -24,6 +24,31 @@ class IQueryParser(Interface.Base):
Return a parse tree (which implements IQueryParseTree).
Some of the query terms may be ignored because they are
stopwords; use getIgnored() to find out which terms were
ignored. But if the entire query consists only of stopwords,
or of stopwords and one or more negated terms, an exception is
raised.
May raise ParseTree.ParseError.
"""
def getIgnored():
"""Return the list of ignored terms.
Return the list of terms that were ignored by the most recent
call to parseQuery() because they were stopwords.
If parseQuery() was never called, this returns None.
"""
def parseQueryEx(query):
"""Parse a query string.
Return a tuple (tree, ignored) where 'tree' is the parse tree
as returned by parseQuery(), and 'ignored' is a list of
ignored terms as returned by getIgnored().
May raise ParseTree.ParseError.
"""
......
......@@ -69,10 +69,22 @@ class Lexicon:
wids.append(self._wids.get(word, 0))
return wids
def parseTerms(self, text):
last = _text2list(text)
for element in self._pipeline:
process = getattr(element, "processGlob", element.process)
last = process(last)
return last
def isGlob(self, word):
return "*" in word
def get_word(self, wid):
"""Return the word for the given word id"""
return self._words[wid]
def get_wid(self, word):
return self._wids.get(word, 0)
def globToWordIds(self, pattern):
if not re.match(r"^\w+\*$", pattern):
return []
......@@ -116,6 +128,7 @@ class Splitter:
import re
rx = re.compile(r"\w+")
rxGlob = re.compile(r"\w+\*?")
def process(self, lst):
result = []
......@@ -123,6 +136,12 @@ class Splitter:
result += self.rx.findall(s)
return result
def processGlob(self, lst):
result = []
for s in lst:
result += self.rxGlob.findall(s)
return result
class CaseNormalizer:
def process(self, lst):
......
......@@ -58,7 +58,7 @@ class NotNode(ParseTreeNode):
return []
def executeQuery(self, index):
raise QueryError, "NOT operator must occur right after AND"
raise QueryError, "NOT parse tree node cannot be executed directly"
class AndNode(ParseTreeNode):
......
......@@ -83,7 +83,7 @@ _tokenizer_regex = re.compile(r"""
| -?
# followed by
(?:
# a string
# a string inside double quotes (and not containing these)
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
......@@ -92,46 +92,64 @@ _tokenizer_regex = re.compile(r"""
class QueryParser:
# This class is not thread-safe;
# each thread should have its own instance
def __init__(self, lexicon):
self._lexicon = lexicon
self._ignored = None
# Public API methods
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
self.__tokens = tokens
self._tokens = tokens
# classify tokens
self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
self._tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self.__tokens.append(_EOF)
self.__tokentypes.append(_EOF)
self.__index = 0
self._tokens.append(_EOF)
self._tokentypes.append(_EOF)
self._index = 0
# Syntactical analysis.
self._ignored = [] # Ignored words in the query, for parseQueryEx
tree = self._parseOrExpr()
self._require(_EOF)
if tree is None:
raise ParseTree.ParseError(
"Query contains only common words: %s" % repr(query))
return tree
def getIgnored(self):
return self._ignored
def parseQueryEx(self, query):
tree = self.parseQuery(query)
ignored = self.getIgnored()
return tree, ignored
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self.__tokens[self.__index]
t = self._tokens[self._index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self.__tokentypes[self.__index] is tokentype:
self.__index += 1
if self._tokentypes[self._index] is tokentype:
self._index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self.__tokentypes[self.__index] is tokentype
return self._tokentypes[self._index] is tokentype
def _get(self, tokentype):
t = self.__tokens[self.__index]
t = self._tokens[self._index]
self._require(tokentype)
return t
......@@ -140,16 +158,31 @@ class QueryParser:
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
if len(L) == 1:
L = filter(None, L)
if not L:
return None # Only stopwords
elif len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
L.append(self._parseTerm())
t = self._parseTerm()
if t is not None:
L.append(t)
Nots = []
while self._check(_AND):
L.append(self._parseNotExpr())
t = self._parseNotExpr()
if t is None:
continue
if isinstance(t, ParseTree.NotNode):
Nots.append(t)
else:
L.append(t)
if not L:
return None # Only stopwords
L.extend(Nots)
if len(L) == 1:
return L[0]
else:
......@@ -157,7 +190,10 @@ class QueryParser:
def _parseNotExpr(self):
if self._check(_NOT):
return ParseTree.NotNode(self._parseTerm())
t = self._parseTerm()
if t is None:
return None # Only stopwords
return ParseTree.NotNode(t)
else:
return self._parseTerm()
......@@ -172,12 +208,13 @@ class QueryParser:
nodes = []
nots = []
for a in atoms:
words = re.findall(r"\w+\*?", a)
words = self._lexicon.parseTerms(a)
if not words:
continue
self._ignored.append(a)
continue # Only stopwords
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
elif words[0].endswith("*"):
elif self._lexicon.isGlob(words[0]):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
......@@ -187,9 +224,7 @@ class QueryParser:
else:
nodes.append(n)
if not nodes:
text = " ".join(atoms)
msg = "At least one positive term required: %r" % text
raise ParseTree.ParseError, msg
return None # Only stopwords
nodes.extend(nots)
if len(nodes) == 1:
tree = nodes[0]
......
......@@ -61,18 +61,18 @@ class TestQueryParser(TestCase):
def testParseQuery(self):
self.expect("foo", AtomNode("foo"))
self.expect("note", AtomNode("note"))
self.expect("a and b AND c",
AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a OR b or c",
OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a AND b OR c AnD d",
OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
AndNode([AtomNode("c"), AtomNode("d")])]))
self.expect("(a OR b) AND (c OR d)",
AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
OrNode([AtomNode("c"), AtomNode("d")])]))
self.expect("a AND not b",
AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
self.expect("aa and bb AND cc",
AndNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
self.expect("aa OR bb or cc",
OrNode([AtomNode("aa"), AtomNode("bb"), AtomNode("cc")]))
self.expect("aa AND bb OR cc AnD dd",
OrNode([AndNode([AtomNode("aa"), AtomNode("bb")]),
AndNode([AtomNode("cc"), AtomNode("dd")])]))
self.expect("(aa OR bb) AND (cc OR dd)",
AndNode([OrNode([AtomNode("aa"), AtomNode("bb")]),
OrNode([AtomNode("cc"), AtomNode("dd")])]))
self.expect("aa AND not bb",
AndNode([AtomNode("aa"), NotNode(AtomNode("bb"))]))
self.expect('"foo bar"', PhraseNode("foo bar"))
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
......@@ -80,7 +80,10 @@ class TestQueryParser(TestCase):
self.expect('(("foo bar"))"', PhraseNode("foo bar"))
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('and/', AtomNode("and"))
if self.__class__ is TestQueryParser:
# This test fails when testZCTextIndex subclasses this class,
# because its lexicon's pipeline removes stopwords
self.expect('and/', AtomNode("and"))
self.expect("foo-bar", PhraseNode("foo bar"))
self.expect("foo -bar", AndNode([AtomNode("foo"),
......
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.tests \
import testIndex, testQueryEngine, testQueryParser
......@@ -9,6 +23,7 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
from Products.ZCTextIndex.ParseTree import ParseError
import re
import unittest
......@@ -84,6 +99,15 @@ class ZCIndexTestsBase:
self.index = self.zc_index.index
self.lexicon = self.zc_index.lexicon
def parserFailure(self, query):
self.assertRaises(ParseError, self.zc_index.query, query)
def parserSuccess(self, query, n):
r, num = self.zc_index.query(query)
self.assertEqual(num, n)
if n:
self.assertEqual(r[0][0], 1)
def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
......@@ -96,61 +120,23 @@ class ZCIndexTestsBase:
self.assertEqual(wids, [])
self.assertEqual(len(self.index.get_words(1)), 1)
r, num = self.zc_index.query('question')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('question AND to AND be')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('to AND question AND be')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('to AND NOT question')
self.assertEqual(num, 0)
r, num = self.zc_index.query('to AND NOT gardenia')
self.assertEqual(num, 0)
r, num = self.zc_index.query('question AND NOT gardenia')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('question AND gardenia')
self.assertEqual(num, 0)
r, num = self.zc_index.query('gardenia')
self.assertEqual(num, 0)
r, num = self.zc_index.query('question OR gardenia')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('question AND NOT to AND NOT be')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('question OR to OR be')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('question to be')
self.assertEqual(num, 1)
self.assertEqual(r[0][0], 1)
r, num = self.zc_index.query('to be')
self.assertEqual(num, 0)
r, num = self.zc_index.query('to AND be')
self.assertEqual(num, 0)
r, num = self.zc_index.query('to OR be')
self.assertEqual(num, 0)
r, num = self.zc_index.query('to AND NOT be')
self.assertEqual(num, 0)
self.parserSuccess('question', 1)
self.parserSuccess('question AND to AND be', 1)
self.parserSuccess('to AND question AND be', 1)
self.parserSuccess('question AND NOT gardenia', 1)
self.parserSuccess('question AND gardenia', 0)
self.parserSuccess('gardenia', 0)
self.parserSuccess('question OR gardenia', 1)
self.parserSuccess('question AND NOT to AND NOT be', 1)
self.parserSuccess('question OR to OR be', 1)
self.parserSuccess('question to be', 1)
self.parserFailure('to be')
self.parserFailure('to AND be')
self.parserFailure('to OR be')
self.parserFailure('to AND NOT be')
self.parserFailure('to AND NOT question')
self.parserFailure('to AND NOT gardenia')
def testDocUpdate(self):
docid = 1 # doesn't change -- we index the same doc repeatedly
......@@ -482,10 +468,11 @@ class QueryTestsBase(testQueryEngine.TestQueryEngine,
# XXX The FauxIndex and the real Index score documents very
# differently. The set comparison can't actually compare the
# items, but it can compare the keys. That will have to do for now.
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d.keys(), dict.keys())
setkeys = list(set.keys())
dictkeys = dict.keys()
setkeys.sort()
dictkeys.sort()
self.assertEqual(setkeys, dictkeys)
class CosineQueryTests(QueryTestsBase):
IndexFactory = CosineIndex
......