diff --git a/product/ERP5OOo/Document/ExternalWebPage.py b/product/ERP5OOo/Document/ExternalWebPage.py index f4916dd01d872b89302e4ea2290dcbb46aae8e8f..3d3cba865951215de68263f828fbf49d1a9d994b 100644 --- a/product/ERP5OOo/Document/ExternalWebPage.py +++ b/product/ERP5OOo/Document/ExternalWebPage.py @@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument): self.urldict={} self._p_changed=1 - def _processData(self,s): + def _processData(self,s, inf): + # since this is a web page, we don't want anything else + # XXX we should find another way - like this, we end up with empty draft objects + if (inf.getmaintype(),inf.getsubtype())!=('text','html'): + raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype())) top=self._findTopObject() # record my url in top object top.addUrl(self.getQualifiedUrl()) @@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument): # first find links in text rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) for ref in re.findall(rx, s): - if ref.startswith('javascript'): + # eliminate anchors and specials, select internal links + if ref.startswith('javascript') or ref.startswith('mailto'): continue - # XXX not sure where to store those already spidered - # for now, the only precaution against infinite loop is recursion depth - # select internal links + ref=re.sub('#.*','',ref) + if ref=='':continue baseref='/'.join(self.getQualifiedUrl().split('/')[:-1]) if not ref.startswith('http'): # complete relative paths