Commit a8226c4e authored by Guido van Rossum

Get rid of the unused HTMLSplitter class (it's too simple).

Add glob support to the HTMLWordSplitter class.
parent fbd41e2f
...@@ -17,33 +17,26 @@ from Products.ZCTextIndex.PipelineFactory import element_factory ...@@ -17,33 +17,26 @@ from Products.ZCTextIndex.PipelineFactory import element_factory
import re import re
class HTMLSplitter:
__implements__ = ISplitter
def process(self, text):
return re.sub('<[^>]*>', ' ', text).split()
class HTMLWordSplitter: class HTMLWordSplitter:
__implements__ = ISplitter __implements__ = ISplitter
def process(self, text): def process(self, text, wordpat=r"\w+"):
splat = [] splat = []
for t in text: for t in text:
splat += self._split(t) splat += self._split(t, wordpat)
return splat return splat
def _split(self, text): def processGlob(self, text):
return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
def _split(self, text, wordpat):
text = text.lower() text = text.lower()
remove = ["<[^>]*>", remove = [r"<[^<>]*>",
"&[A-Za-z]+;", r"&[A-Za-z]+;"]
"\W+"]
for pat in remove: for pat in remove:
text = re.sub(pat, " ", text) text = re.sub(pat, " ", text)
rx = re.compile("[A-Za-z]") return re.findall(wordpat, text)
return [word for word in text.split()
if len(word) > 1 and rx.search(word)]
element_factory.registerFactory('Word Splitter', element_factory.registerFactory('Word Splitter',
'HTML aware splitter', 'HTML aware splitter',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment