Commit 097d0e19 authored by Bartek Górny

Fixed URL recording; clean multiple slashes from URLs.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10538 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 163757eb
...@@ -166,8 +166,6 @@ class ExternalWebPage(ExternalDocument): ...@@ -166,8 +166,6 @@ class ExternalWebPage(ExternalDocument):
if (inf.getmaintype(),inf.getsubtype())!=('text','html'): if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype())) raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
top=self._findTopObject() top=self._findTopObject()
# record my url in top object
top.addUrl(self.getQualifiedUrl())
# remove current subobjects # remove current subobjects
self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')]) self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
if self.getOptionRecursively()>0 and self.getRecursionDepth()>0: if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
...@@ -179,12 +177,18 @@ class ExternalWebPage(ExternalDocument): ...@@ -179,12 +177,18 @@ class ExternalWebPage(ExternalDocument):
continue continue
ref=re.sub('#.*','',ref) ref=re.sub('#.*','',ref)
if ref=='':continue if ref=='':continue
baseref='/'.join(self.getQualifiedUrl().split('/')[:-1]) #baseref='/'.join(self.getQualifiedUrl().split('/'))
baseref=self.getQualifiedUrl()
if not ref.startswith('http'): if not ref.startswith('http'):
# complete relative paths # complete relative paths
ref=baseref+'/'+ref ref=baseref+'/'+ref
# eliminate multiple slashes
rx=re.compile('([^:]{1})\/{2,}')
ref=re.sub(rx,'\1/',ref)
# create subobjects # create subobjects
if ref.startswith(baseref) and not top.checkUrl(ref): if ref.startswith(baseref) and not top.checkUrl(ref):
# record my url in top object
top.addUrl(ref)
n=self.newContent(portal_type='External Web Page') n=self.newContent(portal_type='External Web Page')
# set coordinates # set coordinates
n.setUrlProtocol('http') n.setUrlProtocol('http')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment