Get rid of the unused HTMLSplitter class (it's too simple).

Add glob support to the HTMLWordSplitter class.

Get rid of the unused HTMLSplitter class (it's too simple).
Add glob support to the HTMLWordSplitter class.
a8226c4e · Guido van Rossum · fbd41e2f · a8226c4e
Commit a8226c4e authored May 22, 2002 by Guido van Rossum
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 9 additions and 16 deletions

lib/python/Products/ZCTextIndex/HTMLSplitter.py lib/python/Products/ZCTextIndex/HTMLSplitter.py +9 -16

No files found.
--- a/lib/python/Products/ZCTextIndex/HTMLSplitter.py
+++ b/lib/python/Products/ZCTextIndex/HTMLSplitter.py
@@ -17,33 +17,26 @@ from Products.ZCTextIndex.PipelineFactory import element_factory
 import re
-class HTMLSplitter:
-    __implements__ = ISplitter
-    def process(self, text):
-        return re.sub('<[^>]*>', ' ', text).split()
 class HTMLWordSplitter:
    __implements__ = ISplitter
-    def process(self, text):
+    def process(self, text, wordpat=r"\w+"):
        splat = []
        for t in text:
-            splat += self._split(t)
+            splat += self._split(t, wordpat)
        return splat
-    def _split(self, text):
+    def processGlob(self, text):
+        return self.process(text, r"\w+[\w*?]*") # see Lexicon.globToWordIds()
+    def _split(self, text, wordpat):
        text = text.lower()
-        remove = ["<[^>]*>",
+        remove = [r"<[^<>]*>",
-                  "&[A-Za-z]+;",
+                  r"&[A-Za-z]+;"]
-                  "\W+"]
        for pat in remove:
            text = re.sub(pat, " ", text)
-        rx = re.compile("[A-Za-z]")
+        return re.findall(wordpat, text)
-        return [word for word in text.split()
-                if len(word) > 1 and rx.search(word)]
 element_factory.registerFactory('Word Splitter', 
                                'HTML aware splitter',