scan only text/html, skip anchors, don't follow mailto

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10487 20353a03-c40f-0410-a6d1-a30d3c3de9de

scan only text/html, skip anchors, don't follow mailto
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10487 20353a03-c40f-0410-a6d1-a30d3c3de9de
731b9498 · Bartek Górny · 4995cde0 · 731b9498
Commit 731b9498 authored Oct 02, 2006 by Bartek Górny
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 5 deletions

product/ERP5OOo/Document/ExternalWebPage.py product/ERP5OOo/Document/ExternalWebPage.py +9 -5

No files found.
--- a/product/ERP5OOo/Document/ExternalWebPage.py
+++ b/product/ERP5OOo/Document/ExternalWebPage.py
@@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument):
    self.urldict={}
    self._p_changed=1

-  def _processData(self,s):
+  def _processData(self,s, inf):
+    # since this is a web page, we accept only text/html content
+    # XXX we should find another way: raising here still leaves us with empty draft objects
+    if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+      raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
    top=self._findTopObject()
    # record my url in top object
    top.addUrl(self.getQualifiedUrl())
@@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument):
      # first find links in text
      rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
      for ref in re.findall(rx, s):
-        if ref.startswith('javascript'):
+        # skip javascript and mailto links, strip anchors, then select internal links
+        if ref.startswith('javascript') or ref.startswith('mailto'):
          continue
-        # XXX not sure where to store those already spidered
-        # for now, the only precaution against infinite loop is recursion depth
-        # select internal links
+        ref=re.sub('#.*','',ref)
+        if ref=='':continue
        baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
        if not ref.startswith('http'):
          # complete relative paths