Use the feedparser Python library rather than hand-parsing XML.

Translate UI messages. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40030 20353a03-c40f-0410-a6d1-a30d3c3de9de

Use the feedparser Python library rather than hand-parsing XML.
Translate UI messages. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40030 20353a03-c40f-0410-a6d1-a30d3c3de9de
ccbe2120 · Ivan Tyagov · 1ddbf6ac · ccbe2120 · ccbe2120 · ccbe2120
Commit ccbe2120 authored Nov 08, 2010 by Ivan Tyagov
4 changed files
--- a/bt5/erp5_knowledge_pad/ExtensionTemplateItem/GetRssDataAsDict.py
+++ b/bt5/erp5_knowledge_pad/ExtensionTemplateItem/GetRssDataAsDict.py
-from urllib2 import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, \
-     build_opener, install_opener, urlopen, HTTPError
-from xml.dom.minidom import parseString
-import md5
-from HTMLParser import HTMLParser
-import socket
-
-def getRssDataAsDict(url, username, password):
-  passman = HTTPPasswordMgrWithDefaultRealm()
-  passman.add_password(None, url, username, password)
-  auth_handler = HTTPBasicAuthHandler(passman)
-  opener = build_opener(auth_handler)
-  install_opener(opener)
-  try:
-    default_timeout = socket.getdefaulttimeout()
-    socket.setdefaulttimeout(5.0)
-    try:
-      file = urlopen(url)
-    finally:
-      socket.setdefaulttimeout(default_timeout)
-      
-  except IOError , e:
-    return {'title': 'Connection problem, please retry later.'}
-  except ValueError , e:
-   return {'title': 'Please enter a valid Rss or Atom url in the preference form.' }
-  except HTTPError , e:
-    if hasattr(e, 'code'):
-      if e.code == 401:
-        return {'title': 'Unauthorized, verify your authentication.' }
-      if e.code == 404:
-        return {'title': 'Page not found.' }
-  except :
-    return {'title': 'Fetching Rss failed.' }
-  return parseRssDataAsDict(file.read())
-
-def parseRssDataAsDict(rss_string):
-  try:
-    xmlDoc = parseString(rss_string).documentElement
-  except :
-    return {'title': 'Parsing RSS failed.' }
-  if(xmlDoc.tagName.startswith('rss') or xmlDoc.tagName.startswith('rdf') ):
-    feed_data = {}
-    RSSTitle = None
-    if (xmlDoc.getElementsByTagName('title') and xmlDoc.getElementsByTagName('title')[0].parentNode.tagName != 'item'):
-      feed_data['title'] = xmlDoc.getElementsByTagName('title')[0].firstChild.nodeValue
-    if (xmlDoc.getElementsByTagName('image') and xmlDoc.getElementsByTagName('image')[0].parentNode.tagName != 'item'):
-      logo = xmlDoc.getElementsByTagName('image')[0]
-      if (logo.getElementsByTagName('url')):
-        feed_data['logo'] = logo.getElementsByTagName('url')[0].firstChild.nodeValue
-      elif(logo.getElementsByTagName('rdf:resource')):
-        feed_data['logo'] = logo.getElementsByTagName('rdf:resource')[0].firstChild.nodeValue
-    if (xmlDoc.getElementsByTagName('link') and xmlDoc.getElementsByTagName('link')[0].parentNode.tagName != 'item'):
-      feed_data['link'] = xmlDoc.getElementsByTagName('link')[0].firstChild.nodeValue
-    item_list = xmlDoc.getElementsByTagName('item')
-    feed_data['items'] = []
-    for item in item_list:
-      message = {}
-      message['other_links'] = []
-      message['img'] = []
-      if(item.getElementsByTagName('title') and item.getElementsByTagName('title')[0].firstChild):
-        message['title'] = item.getElementsByTagName('title')[0].firstChild.nodeValue
-      if(item.getElementsByTagName('link') and item.getElementsByTagName('link')[0].firstChild):
-        message['link'] = item.getElementsByTagName('link')[0].firstChild.nodeValue
-      if(item.getElementsByTagName('description') and item.getElementsByTagName('description')[0].firstChild):
-        message['content'] = cleanHTML(item.getElementsByTagName('description')[0].firstChild.nodeValue)
-      if (item.getElementsByTagName('pubDate') and item.getElementsByTagName('pubDate')[0].firstChild):
-        message['date'] = item.getElementsByTagName('pubDate')[0].firstChild.nodeValue
-      elif(item.getElementsByTagName('dc:date') and item.getElementsByTagName('dc:date')[0].firstChild):
-        message['date'] = item.getElementsByTagName('dc:date')[0].firstChild.nodeValue
-      if (item.getElementsByTagName('enclosure')):
-        for enclosure in item.getElementsByTagName('enclosure'):
-          if (str(enclosure.attributes['type'].nodeValue).find('image') != -1):
-            message['img'].append(enclosure.attributes['url'].nodeValue)
-          else:
-            if (enclosure.attributes.has_key('title')):
-              message['other_links'].append('<a href="'+enclosure.attributes['url'].nodeValue+'"target="_blank">'+enclosure.attributes['url'].nodeValue+'</a>')
-            else:
-              message['other_links'].append('<a href="'+enclosure.attributes['url'].nodeValue+'"target="_blank">'+enclosure.attributes['title'].nodeValue+'</a>')
-      message['md5'] = md5.new(str(message)).hexdigest()
-      feed_data['items'].append(message)
-  elif(xmlDoc.tagName == 'feed'):
-    feed_data = {}
-    feedTitle = None
-    if (xmlDoc.getElementsByTagName('title') and xmlDoc.getElementsByTagName('title')[0].parentNode.tagName != 'entry'):
-      feed_data['title'] = xmlDoc.getElementsByTagName('title')[0].firstChild.nodeValue
-    if (xmlDoc.getElementsByTagName('icon') and xmlDoc.getElementsByTagName('icon')[0].parentNode.tagName != 'entry'):
-      feed_data['logo'] = xmlDoc.getElementsByTagName('icon')[0].firstChild.nodeValue
-    item_list = xmlDoc.getElementsByTagName('entry')
-    feed_data['items'] = []
-    for item in item_list:
-      message = {}
-      if(item.getElementsByTagName('title') and item.getElementsByTagName('title')[0].firstChild):
-        message['title'] = item.getElementsByTagName('title')[0].firstChild.nodeValue
-      message['other_links'] = []
-      message['img'] = []
-      for link in item.getElementsByTagName('link'):
-        if (link.attributes.has_key('rel') and link.attributes.get('rel').nodeValue == 'alternate'):
-          message['link'] = link.attributes['href'].nodeValue
-        elif (link.attributes.has_key('type') and link.attributes.get('type').nodeValue.find('image') != -1):
-          message['img'].append(link.attributes['href'].nodeValue)
-        else:
-          if (link.attributes.has_key('title')):
-            message['other_links'].append('<a href="'+link.attributes['href'].nodeValue+'" target="_blank">'+link.attributes['title'].nodeValue+'</a>')
-          else:
-            message['other_links'].append('<a href="'+link.attributes['href'].nodeValue+'"target="_blank">'+link.attributes['href'].nodeValue+'</a>')
-      if (item.getElementsByTagName('content') and item.getElementsByTagName('content')[0].firstChild):
-        message['content'] = stringConstructor(item.getElementsByTagName('content')[0])
-      elif (item.getElementsByTagName('summary') and item.getElementsByTagName('summary')[0].firstChild):
-        message['content'] = stringConstructor(item.getElementsByTagName('summary')[0])
-      if (item.getElementsByTagName('updated') and item.getElementsByTagName('updated')[0].firstChild):
-        message['date'] = item.getElementsByTagName('updated')[0].firstChild.nodeValue
-      elif (item.getElementsByTagName('modified') and item.getElementsByTagName('modified')[0].firstChild):
-        message['date'] = item.getElementsByTagName('modified')[0].firstChild.nodeValue
-      message['md5'] = md5.new(str(message)).hexdigest()
-      feed_data['items'].append(message)
-  else:
-    return {'title': 'This reader can\'t read this feed'}
-  return feed_data
-
-
-class HTMLCleaner(HTMLParser):
-  def __init__(self):
-    HTMLParser.__init__(self)
-    self.html = ''
-    self.script = 0
-  def handle_starttag(self, tag, attrs):
-    if tag !='script' and tag !='input' and tag !='button' :
-      self.html += '<'+tag+' '
-      for attr in attrs:
-        if not attr[0].startswith('on'):
-          self.html += attr[0]+'=' +attr[1]+' '
-      if tag=='a':
-        self.html += 'target="_blank" '
-      self.html += '>'
-    else:
-      self.script = 1
-  def handle_data(self, data):
-    if not self.script:
-      self.html += data
-  def handle_charref(self, name):
-    self.html += '&#'+name+';'
-  def handle_entityref(self, name):
-    self.html += '&'+name+';'
-  def handle_endtag(self, tag):
-    if tag !='script' and tag !='input' and tag !='button' :
-      self.html += '</'+tag+'>'
-    else:
-      self.script = 0
-  def handle_startendtag(self, tag, attrs):
-    if tag !='script' and tag !='input' and tag !='button' :
-      self.html += '<'+tag+' '
-      for attr in attrs:
-        if not attr[0].startswith('on'):
-          self.html += attr[0]+'=' +attr[1]+' '
-      self.html += '/>'
-
-def cleanHTML(string):
-  html = ''
-  parser= HTMLCleaner()
-  parser.feed(string)
-  return parser.html
-
-def stringConstructor(domItem):
-  string = ''
-  for item in domItem.childNodes:
-    if item.nodeType == 3:
-      string = string + item.nodeValue
-    elif item.nodeType == 1 and item.tagName != 'script' and item.tagName != 'input' and item.tagName != 'button':
-      string = string + '<' + item.tagName + ' '
-      if item.attributes:
-        for att in item.attributes.items():
-          if(not att[0].startswith('on')):
-            string = string + att[0] + '=' + att[1] + ' '
-      if item.tagName == 'a':
-        string = string + 'target="_blank" '
-      string = string + '>'
-      string = string + stringConstructor(item)
-      string = string + '</' + item.tagName + '>'
-  return string
+import feedparser, md5, urllib2, socket
+  
+def getRssDataAsDict(self, url, username=None, password=None):
+  """
+  Fetch and parse the RSS or Atom feed at url, returning it as a dict.
+
+  self is only used to reach Base_translateString, which translates the
+  user-visible messages. When both username and password are not None, an
+  HTTP basic-auth handler is handed to feedparser.
+
+  On failure the returned dict holds a single 'title' key carrying a
+  translated message. On success it holds 'title', 'link', 'items' and,
+  when the feed provides an image url, 'logo'. 'items' is a list with one
+  dict per entry (keys: 'title', 'link', 'other_links', 'md5', 'content',
+  'date', 'img', 'updated_parsed'), ordered from most to least recently
+  updated. Fields a feed does not provide default to '' (or None for
+  'updated_parsed') instead of raising.
+  """
+  translate = self.Base_translateString
+  # no url, no feed to read
+  if url in ('', None, 'None',):
+    return {'title':translate('Please enter a valid Rss or Atom url in the preference form.')}
+
+  # use authentication or not?
+  handlers = []
+  if username is not None and password is not None:
+    passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
+    passman.add_password(None, url, username, password)
+    handlers.append(urllib2.HTTPBasicAuthHandler(passman))
+
+  # set a shorter timeout; the finally clause restores the process-wide
+  # default even when the fetch raises
+  default_timeout = socket.getdefaulttimeout()
+  socket.setdefaulttimeout(10.0)
+  try:
+    d = feedparser.parse(url, handlers=handlers)
+  finally:
+    socket.setdefaulttimeout(default_timeout)
+
+  if d.get('bozo') and isinstance(d.get('bozo_exception'), urllib2.URLError):
+    # we have an URL error
+    return {'title':translate('Wrong Rss or Atom url or service temporary down.')}
+
+  # http status code checks; 'status' is missing when no HTTP response was
+  # received, so it must not be read as an attribute
+  status = d.get('status')
+  if status == 401:
+    return {'title': translate('Unauthorized, verify your authentication.')}
+  elif status == 404:
+    return {'title': translate('Page not found.')}
+
+  feed = d.get('feed', {})
+  result = {'items': []}
+  # some feeds may not provide logo
+  image = feed.get('image', None)
+  if image is not None and image.get('href') is not None:
+    result['logo'] = image['href']
+  result['title'] = feed.get('title', '')
+  result['link'] = feed.get('link', '')
+  for entry in d.get('entries', []):
+    link = entry.get('link', '')
+    # md5.new() fails on a non-ASCII unicode string; hash its UTF-8 bytes
+    # (for pure ASCII links the digest is unchanged)
+    if isinstance(link, unicode):
+      link_bytes = link.encode('utf-8')
+    else:
+      link_bytes = link
+    entry_dict = {}
+    entry_dict['title'] = entry.get('title', '')
+    entry_dict['link'] = link
+    entry_dict['other_links'] = [x['href'] for x in entry.get('links', []) if 'href' in x]
+    entry_dict['md5'] = md5.new(link_bytes).hexdigest()
+    entry_dict['content'] = entry.get('summary', '')
+    entry_dict['date'] = entry.get('updated', '')
+    entry_dict['img'] = [x['href'] for x in entry.get('enclosures', []) if 'href' in x]
+    entry_dict['updated_parsed'] = entry.get('updated_parsed', None)
+    result['items'].append(entry_dict)
+  # sort by date, newest first (ascending sort then reverse, as before, so
+  # the relative order of entries sharing a date is unchanged)
+  result['items'].sort(key=lambda k: k['updated_parsed'])
+  result['items'].reverse()
+  return result
\ No newline at end of file
--- a/bt5/erp5_knowledge_pad/SkinTemplateItem/portal_skins/erp5_gadget/Base_getRssDataAsDocumentList.xml
+++ b/bt5/erp5_knowledge_pad/SkinTemplateItem/portal_skins/erp5_gadget/Base_getRssDataAsDocumentList.xml
@@ -69,7 +69,7 @@ else:\n
 feed_url = str(preferences.get(\'preferred_rss_feed\',\'\'))\n
 username = str(preferences.get(\'preferred_username\',\'\'))\n
 password = str(preferences.get(\'preferred_password\',\'\'))\n
-results = context.Base_getRssDataAsDict(url = feed_url, username = username, password = password)\n
+results = context.Base_getRssDataAsDict(context, url = feed_url, username = username, password = password)\n
 readItemList = {}\n
 md5_list = []\n
 message_list = []\n

--- a/bt5/erp5_knowledge_pad/SkinTemplateItem/portal_skins/erp5_gadget/RssFeed_getSummaryAsHTML.xml
+++ b/bt5/erp5_knowledge_pad/SkinTemplateItem/portal_skins/erp5_gadget/RssFeed_getSummaryAsHTML.xml
@@ -57,7 +57,7 @@
         tal:attributes="class python: test(is_read, \'teaser read\', \'teaser unread\');\n
                         onclick string:if(this.className!=\'teaser read\'){this.className=\'teaser read\';;MochiKit.Async.doSimpleXMLHttpRequest(\'Base_setRssItemReadInSelection\', {\'selection_name\':\'${selection_name}\',\'item\':\'${md5}\'})};;toggle(\'${dom_id}\');">\n
      <img class="thumbnail-image"\n
-           tal:condition="python: image not in ((), None,)"\n
+           tal:condition="python: image not in ((), None, [])"\n
           tal:attributes="src python: image[0]" alt="Thumbnail"/>\n
      <span class="document-title"\n
            tal:content="title"/> - \n

--- a/bt5/erp5_knowledge_pad/bt/revision
+++ b/bt5/erp5_knowledge_pad/bt/revision
-597
\ No newline at end of file
+601
\ No newline at end of file