Commit ccbe2120 authored by Ivan Tyagov's avatar Ivan Tyagov

Use the feedparser Python library rather than hand-parsing XML.

Translate UI messages.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@40030 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 1ddbf6ac
from urllib2 import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, \
build_opener, install_opener, urlopen, HTTPError
from xml.dom.minidom import parseString
import md5
from HTMLParser import HTMLParser
import socket
def getRssDataAsDict(url, username, password):
    """Fetch the feed at ``url`` and return it parsed as a dictionary.

    The request is sent with HTTP basic authentication built from
    ``username`` and ``password``.

    Returns the dictionary produced by ``parseRssDataAsDict`` on success.
    On failure, returns a dictionary holding only a ``'title'`` key whose
    value is a message describing the problem (connection problem, invalid
    url, 401, 404 or generic failure).
    """
    passman = HTTPPasswordMgrWithDefaultRealm()
    passman.add_password(None, url, username, password)
    # Use a private opener instead of install_opener(): the credentials
    # must not leak into the process-wide urllib2 state.
    opener = build_opener(HTTPBasicAuthHandler(passman))
    default_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(5.0)
    try:
        try:
            response = opener.open(url)
            try:
                rss_string = response.read()
            finally:
                response.close()
        finally:
            # Always put the process-wide default timeout back.
            socket.setdefaulttimeout(default_timeout)
    except HTTPError as e:
        # HTTPError derives from IOError, so it has to be handled before
        # the IOError clause or the status specific messages are never used.
        code = getattr(e, 'code', None)
        if code == 401:
            return {'title': 'Unauthorized, verify your authentication.'}
        if code == 404:
            return {'title': 'Page not found.'}
        return {'title': 'Fetching Rss failed.'}
    except IOError:
        return {'title': 'Connection problem, please retry later.'}
    except ValueError:
        return {'title': 'Please enter a valid Rss or Atom url in the preference form.'}
    except Exception:
        return {'title': 'Fetching Rss failed.'}
    return parseRssDataAsDict(rss_string)
def _escapeFeedText(text):
    """Escape ``text`` so it can be placed inside HTML markup or attributes."""
    text = text.replace('&', '&amp;').replace('"', '&quot;')
    return text.replace('<', '&lt;').replace('>', '&gt;')


def _linkAnchor(url, label):
    """Return an HTML anchor opening ``url`` in a new window, shown as ``label``."""
    return ('<a href="' + _escapeFeedText(url) + '" target="_blank">'
            + _escapeFeedText(label) + '</a>')


def _firstChildNode(node, tag_name):
    """Return the first child node of the first ``tag_name`` descendant.

    Returns None when ``node`` has no such descendant or when that
    descendant is empty.
    """
    element_list = node.getElementsByTagName(tag_name)
    if element_list:
        return element_list[0].firstChild
    return None


def _feedLevelElement(xml_doc, tag_name, entry_tag):
    """Return the first ``tag_name`` element unless it sits in an entry.

    The first match in document order is kept only when its parent is not
    an ``entry_tag`` element; otherwise None is returned.
    """
    element_list = xml_doc.getElementsByTagName(tag_name)
    if element_list and element_list[0].parentNode.tagName != entry_tag:
        return element_list[0]
    return None


def _parseRssItem(item):
    """Convert one RSS/RDF ``item`` element into a message dictionary."""
    message = {'other_links': [], 'img': []}
    for tag_name in ('title', 'link'):
        child = _firstChildNode(item, tag_name)
        if child is not None:
            message[tag_name] = child.nodeValue
    description = _firstChildNode(item, 'description')
    if description is not None:
        message['content'] = cleanHTML(description.nodeValue)
    date = _firstChildNode(item, 'pubDate')
    if date is None:
        date = _firstChildNode(item, 'dc:date')
    if date is not None:
        message['date'] = date.nodeValue
    for enclosure in item.getElementsByTagName('enclosure'):
        # getAttribute() returns '' for a missing attribute, so enclosures
        # lacking ``type`` or ``title`` no longer raise KeyError.
        enclosure_url = enclosure.getAttribute('url')
        if not enclosure_url:
            continue
        if 'image' in enclosure.getAttribute('type'):
            message['img'].append(enclosure_url)
        else:
            # Show the title when there is one, the url otherwise.
            label = enclosure.getAttribute('title') or enclosure_url
            message['other_links'].append(_linkAnchor(enclosure_url, label))
    message['md5'] = md5.new(str(message)).hexdigest()
    return message


def _parseRssChannel(xml_doc):
    """Build the feed dictionary of an RSS or RDF document."""
    feed_data = {}
    title = _feedLevelElement(xml_doc, 'title', 'item')
    if title is not None and title.firstChild is not None:
        feed_data['title'] = title.firstChild.nodeValue
    logo = _feedLevelElement(xml_doc, 'image', 'item')
    if logo is not None:
        logo_url = _firstChildNode(logo, 'url')
        if logo_url is not None:
            feed_data['logo'] = logo_url.nodeValue
        elif logo.hasAttribute('rdf:resource'):
            # ``rdf:resource`` is an attribute of <image>, not a child element.
            feed_data['logo'] = logo.getAttribute('rdf:resource')
    link = _feedLevelElement(xml_doc, 'link', 'item')
    if link is not None and link.firstChild is not None:
        feed_data['link'] = link.firstChild.nodeValue
    feed_data['items'] = [_parseRssItem(item)
                          for item in xml_doc.getElementsByTagName('item')]
    return feed_data


def _parseAtomEntry(entry):
    """Convert one Atom ``entry`` element into a message dictionary."""
    message = {'other_links': [], 'img': []}
    title = _firstChildNode(entry, 'title')
    if title is not None:
        message['title'] = title.nodeValue
    for link in entry.getElementsByTagName('link'):
        href = link.getAttribute('href')
        if not href:
            # A link without target cannot be rendered; skip it.
            continue
        if link.getAttribute('rel') == 'alternate':
            message['link'] = href
        elif 'image' in link.getAttribute('type'):
            message['img'].append(href)
        else:
            label = link.getAttribute('title') or href
            message['other_links'].append(_linkAnchor(href, label))
    # ``content`` wins over ``summary`` when both are filled in.
    for tag_name in ('content', 'summary'):
        element_list = entry.getElementsByTagName(tag_name)
        if element_list and element_list[0].firstChild is not None:
            # NOTE(review): text nodes directly under the element are
            # copied as markup without passing through cleanHTML -- confirm
            # whether this content should be sanitised as well.
            message['content'] = stringConstructor(element_list[0])
            break
    date = _firstChildNode(entry, 'updated')
    if date is None:
        date = _firstChildNode(entry, 'modified')
    if date is not None:
        message['date'] = date.nodeValue
    message['md5'] = md5.new(str(message)).hexdigest()
    return message


def _parseAtomFeed(xml_doc):
    """Build the feed dictionary of an Atom document."""
    feed_data = {}
    title = _feedLevelElement(xml_doc, 'title', 'entry')
    if title is not None and title.firstChild is not None:
        feed_data['title'] = title.firstChild.nodeValue
    icon = _feedLevelElement(xml_doc, 'icon', 'entry')
    if icon is not None and icon.firstChild is not None:
        feed_data['logo'] = icon.firstChild.nodeValue
    feed_data['items'] = [_parseAtomEntry(entry)
                          for entry in xml_doc.getElementsByTagName('entry')]
    return feed_data


def parseRssDataAsDict(rss_string):
    """Parse an RSS, RDF or Atom document into a dictionary.

    ``rss_string`` is the XML source of the feed.

    On success the result may hold ``'title'``, ``'logo'`` and ``'link'``
    (each only when found in the document) and always holds ``'items'``, a
    list of message dictionaries.  Every message has ``'other_links'``
    (HTML anchors), ``'img'`` (urls) and ``'md5'`` (digest of the message),
    plus ``'title'``, ``'link'``, ``'content'`` and ``'date'`` when the
    entry provides them.

    When the document cannot be parsed, or its root element is neither an
    RSS/RDF nor an Atom one, the result only holds ``'title'`` with a
    message describing the problem.
    """
    try:
        xml_doc = parseString(rss_string).documentElement
    except Exception:
        return {'title': 'Parsing RSS failed.' }
    root_tag = xml_doc.tagName
    if root_tag.startswith('rss') or root_tag.startswith('rdf'):
        return _parseRssChannel(xml_doc)
    if root_tag == 'feed':
        return _parseAtomFeed(xml_doc)
    return {'title': 'This reader can\'t read this feed'}
class HTMLCleaner(HTMLParser):
    """Re-serialise HTML while dropping scripting related markup.

    ``script``, ``input`` and ``button`` tags are removed, attributes whose
    name starts with ``on`` are removed, and every ``a`` start tag gets
    ``target="_blank"``.  The cleaned markup accumulates in ``self.html``.

    ``self.script`` is 1 while the parser is inside a ``script`` or
    ``button`` element (whose text must be discarded) and 0 otherwise.

    NOTE(review): attribute *values* such as ``href="javascript:..."`` are
    not filtered here.
    """

    # Tags that are never copied to the output.
    blocked_tags = ('script', 'input', 'button')
    # Blocked tags that enclose content.  ``input`` is a void element: it
    # has no end tag, so it must not start content suppression.
    content_blocked_tags = ('script', 'button')

    def __init__(self):
        HTMLParser.__init__(self)
        self.html = ''
        self.script = 0

    def _renderAttributes(self, attrs):
        """Serialise ``attrs`` as quoted, escaped attributes.

        Values are quoted so that whitespace inside a value cannot be read
        back as an extra attribute (which would bypass the ``on*`` filter).
        Attributes given without value are written as a bare name.
        """
        rendered = []
        for name, value in attrs:
            if name.startswith('on'):
                continue
            if value is None:
                rendered.append(name + ' ')
                continue
            value = value.replace('&', '&amp;').replace('"', '&quot;')
            value = value.replace('<', '&lt;').replace('>', '&gt;')
            rendered.append(name + '="' + value + '" ')
        return ''.join(rendered)

    def handle_starttag(self, tag, attrs):
        if tag in self.blocked_tags:
            if tag in self.content_blocked_tags:
                self.script = 1
            return
        self.html += '<' + tag + ' ' + self._renderAttributes(attrs)
        if tag == 'a':
            self.html += 'target="_blank" '
        self.html += '>'

    def handle_data(self, data):
        if not self.script:
            self.html += data

    def handle_charref(self, name):
        # References are content too: drop them inside suppressed elements.
        if not self.script:
            self.html += '&#' + name + ';'

    def handle_entityref(self, name):
        if not self.script:
            self.html += '&' + name + ';'

    def handle_endtag(self, tag):
        if tag in self.blocked_tags:
            if tag in self.content_blocked_tags:
                self.script = 0
            return
        self.html += '</' + tag + '>'

    def handle_startendtag(self, tag, attrs):
        if tag in self.blocked_tags:
            return
        self.html += '<' + tag + ' ' + self._renderAttributes(attrs) + '/>'
def cleanHTML(string):
    """Return ``string`` with scripting related markup removed.

    The markup is run through ``HTMLCleaner`` and the text it accumulated
    is returned.
    """
    parser = HTMLCleaner()
    parser.feed(string)
    try:
        # close() makes the parser process whatever it was still buffering
        # (for instance text ending on an unterminated reference).
        parser.close()
    except Exception:
        # Deliberate best effort: a dangling construct at the very end must
        # not discard the markup that was already cleaned.
        pass
    return parser.html
def _escapeDomText(text):
    """Escape ``text`` so it can be placed inside HTML markup or attributes."""
    text = text.replace('&', '&amp;').replace('"', '&quot;')
    return text.replace('<', '&lt;').replace('>', '&gt;')


def _serializeDomChildren(dom_item, escape_text):
    """Serialise the children of ``dom_item``; see ``stringConstructor``.

    ``escape_text`` tells whether text and CDATA nodes found directly under
    ``dom_item`` are escaped (true) or copied unchanged (false).
    """
    blocked_tags = ('script', 'input', 'button')
    parts = []
    for item in dom_item.childNodes:
        # 3 is a text node, 4 a CDATA section; both carry character data.
        if item.nodeType in (3, 4):
            if escape_text:
                parts.append(_escapeDomText(item.nodeValue))
            else:
                parts.append(item.nodeValue)
        elif item.nodeType == 1 and item.tagName.lower() not in blocked_tags:
            parts.append('<' + item.tagName + ' ')
            if item.attributes:
                for name, value in item.attributes.items():
                    if not name.lower().startswith('on'):
                        # Quote and escape the value so it cannot be read
                        # back as additional attributes.
                        parts.append(name + '="' + _escapeDomText(value) + '" ')
            if item.tagName == 'a':
                parts.append('target="_blank" ')
            parts.append('>')
            # Text below an element is character data, never markup.
            parts.append(_serializeDomChildren(item, True))
            parts.append('</' + item.tagName + '>')
    return ''.join(parts)


def stringConstructor(domItem):
    """Serialise the children of the DOM node ``domItem`` as an HTML string.

    Text and CDATA nodes directly under ``domItem`` are copied unchanged.
    Child elements are written back recursively, except ``script``,
    ``input`` and ``button`` elements which are dropped with their content.
    Attributes whose name starts with ``on`` are dropped, the other ones are
    written quoted and escaped, ``a`` elements get ``target="_blank"`` and
    text nested inside elements is escaped.
    """
    return _serializeDomChildren(domItem, False)
import feedparser, md5, urllib2, socket
def getRssDataAsDict(self, url, username=None, password=None):
    """Fetch the feed at ``url`` with feedparser and return it as a dict.

    ``self`` must provide ``Base_translateString``, which is applied to
    every message returned to the user.  When both ``username`` and
    ``password`` are given, HTTP basic authentication is offered to the
    server.

    On success the result holds ``'title'``, ``'link'``, ``'logo'`` (only
    when the feed has an image) and ``'items'``, the list of entries sorted
    from the most recently updated to the oldest.  Entries sharing the same
    update time, or having none, keep the order of the feed.  On failure the
    result only holds ``'title'`` with a translated message describing the
    problem.
    """
    translate = self.Base_translateString
    # no url, no feed to read
    if url in ('', None, 'None',):
        return {'title': translate('Please enter a valid Rss or Atom url in the preference form.')}

    # use authentication or not?
    handlers = []
    if username is not None and password is not None:
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, url, username, password)
        handlers.append(urllib2.HTTPBasicAuthHandler(passman))

    # set a shorter timeout and always restore the default at end of read,
    # even when parsing raises
    default_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(10.0)
    try:
        d = feedparser.parse(url, handlers=handlers)
    finally:
        socket.setdefaulttimeout(default_timeout)

    if d.bozo and isinstance(d.bozo_exception, urllib2.URLError):
        # we have an URL error
        return {'title': translate('Wrong Rss or Atom url or service temporary down.')}

    # http status code checks; ``status`` is read defensively because it
    # may be absent from the parse result
    status = d.get('status')
    if status == 401:
        return {'title': translate('Unauthorized, verify your authentication.')}
    elif status == 404:
        return {'title': translate('Page not found.')}

    result = {'items': []}
    # some feeds may not provide logo
    image = d.feed.get('image', None)
    if image is not None and image.get('href'):
        result['logo'] = image['href']
    # feeds and entries are not required to provide every field: fall back
    # to neutral values instead of raising
    result['title'] = d.feed.get('title', '')
    result['link'] = d.feed.get('link', '')

    for entry in d.entries:
        link = entry.get('link', '')
        digest_source = link
        if not isinstance(digest_source, str):
            # md5 works on bytes: encode unicode explicitly so that links
            # with non ASCII characters do not raise
            digest_source = digest_source.encode('utf-8')
        result['items'].append({
            'title': entry.get('title', ''),
            'link': link,
            'other_links': [x['href'] for x in entry.get('links', [])
                            if x.get('href')],
            'md5': md5.new(digest_source).hexdigest(),
            'content': entry.get('summary', ''),
            'date': entry.get('updated', ''),
            'img': [x['href'] for x in entry.get('enclosures', [])
                    if x.get('href')],
            'updated_parsed': entry.get('updated_parsed'),
        })

    # sort by date, most recent first; entries without date come last
    result['items'].sort(key=lambda item: item['updated_parsed'] or (),
                         reverse=True)
    return result
\ No newline at end of file
......@@ -69,7 +69,7 @@ else:\n
feed_url = str(preferences.get(\'preferred_rss_feed\',\'\'))\n
username = str(preferences.get(\'preferred_username\',\'\'))\n
password = str(preferences.get(\'preferred_password\',\'\'))\n
results = context.Base_getRssDataAsDict(url = feed_url, username = username, password = password)\n
results = context.Base_getRssDataAsDict(context, url = feed_url, username = username, password = password)\n
readItemList = {}\n
md5_list = []\n
message_list = []\n
......
......@@ -57,7 +57,7 @@
tal:attributes="class python: test(is_read, \'teaser read\', \'teaser unread\');\n
onclick string:if(this.className!=\'teaser read\'){this.className=\'teaser read\';;MochiKit.Async.doSimpleXMLHttpRequest(\'Base_setRssItemReadInSelection\', {\'selection_name\':\'${selection_name}\',\'item\':\'${md5}\'})};;toggle(\'${dom_id}\');">\n
<img class="thumbnail-image"\n
tal:condition="python: image not in ((), None,)"\n
tal:condition="python: image not in ((), None, [])"\n
tal:attributes="src python: image[0]" alt="Thumbnail"/>\n
<span class="document-title"\n
tal:content="title"/> - \n
......
597
\ No newline at end of file
601
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment