From 097d0e19f81a3e191ae2bf6d6a63e23aa06d9589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bartek=20G=C3=B3rny?= <bartek@gorny.edu.pl> Date: Wed, 4 Oct 2006 15:35:29 +0000 Subject: [PATCH] fixed url recording; clean multiple slashes from urls; git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@10538 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5OOo/Document/ExternalWebPage.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/product/ERP5OOo/Document/ExternalWebPage.py b/product/ERP5OOo/Document/ExternalWebPage.py index 3d3cba8659..5d5eca39eb 100644 --- a/product/ERP5OOo/Document/ExternalWebPage.py +++ b/product/ERP5OOo/Document/ExternalWebPage.py @@ -166,8 +166,6 @@ class ExternalWebPage(ExternalDocument): if (inf.getmaintype(),inf.getsubtype())!=('text','html'): raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype())) top=self._findTopObject() - # record my url in top object - top.addUrl(self.getQualifiedUrl()) # remove current subobjects self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')]) if self.getOptionRecursively()>0 and self.getRecursionDepth()>0: @@ -179,12 +177,18 @@ class ExternalWebPage(ExternalDocument): continue ref=re.sub('#.*','',ref) if ref=='':continue - baseref='/'.join(self.getQualifiedUrl().split('/')[:-1]) + #baseref='/'.join(self.getQualifiedUrl().split('/')) + baseref=self.getQualifiedUrl() if not ref.startswith('http'): # complete relative paths ref=baseref+'/'+ref + # eliminate multiple slashes + rx=re.compile('([^:]{1})\/{2,}') + ref=re.sub(rx,'\1/',ref) # create subobjects if ref.startswith(baseref) and not top.checkUrl(ref): + # record my url in top object + top.addUrl(ref) n=self.newContent(portal_type='External Web Page') # set coordinates n.setUrlProtocol('http') -- 2.30.9