Commit 731b9498 authored by Bartek Górny

scan only text/html, skip anchors, don't follow mailto

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10487 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 4995cde0
...@@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument): ...@@ -160,7 +160,11 @@ class ExternalWebPage(ExternalDocument):
self.urldict={} self.urldict={}
self._p_changed=1 self._p_changed=1
def _processData(self,s): def _processData(self,s, inf):
# since this is a web page, we don't want anything else
# XXX we should find another way - like this, we end up with empty draft objects
if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
top=self._findTopObject() top=self._findTopObject()
# record my url in top object # record my url in top object
top.addUrl(self.getQualifiedUrl()) top.addUrl(self.getQualifiedUrl())
...@@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument): ...@@ -170,11 +174,11 @@ class ExternalWebPage(ExternalDocument):
# first find links in text # first find links in text
rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE) rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
for ref in re.findall(rx, s): for ref in re.findall(rx, s):
if ref.startswith('javascript'): # eliminate anchors and specials, select internal links
if ref.startswith('javascript') or ref.startswith('mailto'):
continue continue
# XXX not sure where to store those already spidered ref=re.sub('#.*','',ref)
# for now, the only precaution against infinite loop is recursion depth if ref=='':continue
# select internal links
baseref='/'.join(self.getQualifiedUrl().split('/')[:-1]) baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
if not ref.startswith('http'): if not ref.startswith('http'):
# complete relative paths # complete relative paths
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment