Commit 097d0e19 authored by Bartek Górny

fixed url recording; clean multiple slashes from urls;

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10538 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 163757eb
......@@ -166,8 +166,6 @@ class ExternalWebPage(ExternalDocument):
if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
top=self._findTopObject()
# record my url in top object
top.addUrl(self.getQualifiedUrl())
# remove current subobjects
self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
......@@ -179,12 +177,18 @@ class ExternalWebPage(ExternalDocument):
continue
ref=re.sub('#.*','',ref)
if ref=='':continue
baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
#baseref='/'.join(self.getQualifiedUrl().split('/'))
baseref=self.getQualifiedUrl()
if not ref.startswith('http'):
# complete relative paths
ref=baseref+'/'+ref
# eliminate multiple slashes
rx=re.compile('([^:]{1})\/{2,}')
ref=re.sub(rx,'\1/',ref)
# create subobjects
if ref.startswith(baseref) and not top.checkUrl(ref):
# record my url in top object
top.addUrl(ref)
n=self.newContent(portal_type='External Web Page')
# set coordinates
n.setUrlProtocol('http')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment