X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/e3114c366ffcca31fedc33388dd887043e7e0af7..03e4620598f673f0148f8dd4041576fb8947469b:/rss2maildir.py diff --git a/rss2maildir.py b/rss2maildir.py index 268d192..4402c43 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -20,6 +20,7 @@ import sys import os import stat +import httplib import urllib import feedparser @@ -57,7 +58,7 @@ entities = { } class HTML2Text(HTMLParser): - + def __init__(self): self.inheadingone = False self.inheadingtwo = False @@ -88,23 +89,7 @@ class HTML2Text(HTMLParser): elif tag.lower() == "a": self.inlink = True elif tag.lower() == "br": - if self.inparagraph: - self.text = self.text \ - + u'\n'.join( \ - textwrap.wrap(self.currentparagraph, 70)) \ - + u'\n' - self.currentparagraph = "" - elif self.inblockquote: - self.text = self.text \ - + u'\n> ' \ - + u'\n> '.join( \ - [a.strip() \ - for a in textwrap.wrap(self.blockquote, 68) \ - ]) \ - + u'\n' - self.blockquote = u'' - else: - self.text = self.text + u'\n' + self.handle_br() elif tag.lower() == "blockquote": self.inblockquote = True self.text = self.text + u'\n' @@ -139,6 +124,9 @@ class HTML2Text(HTMLParser): def handle_startendtag(self, tag, attrs): if tag.lower() == "br": + self.handle_br() + + def handle_br(self): if self.inparagraph: self.text = self.text \ + u'\n'.join( \ @@ -265,7 +253,57 @@ class HTML2Text(HTMLParser): return data def parse_and_deliver(maildir, url, statedir): - fp = feedparser.parse(url) + feedhandle = None + headers = None + # first check if we know about this feed already + feeddb = dbm.open(os.path.join(statedir, "feeds"), "c") + # we need all the parts of the url + (type, rest) = urllib.splittype(url) + (host, path) = urllib.splithost(rest) + (host, port) = urllib.splitport(host) + if port == None: + port = 80 + if feeddb.has_key(url): + data = feeddb[url] + data = cgi.parse_qs(data) + # now do a head on the feed to see if it's been updated + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn.request("HEAD", path) + response = conn.getresponse() + headers = response.getheaders() + ischanged = False + try: + for header in headers: + if header[0] == "content-length": + if header[1] != data["content-length"][0]: + ischanged = True + elif header[0] == "etag": + if header[1] != data["etag"][0]: + ischanged = True + elif header[0] == "last-modified": + if header[1] != data["last-modified"][0]: + ischanged = True + elif header[0] == "content-md5": + if header[1] != data["content-md5"][0]: + ischanged = True + except: + ischanged = True + if ischanged: + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn.request("GET", path) + response = conn.getresponse() + headers = response.getheaders() + feedhandle = response + else: + return # don't need to do anything, nothings changed. + else: + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn.request("GET", path) + response = conn.getresponse() + headers = response.getheaders() + feedhandle = response + + fp = feedparser.parse(feedhandle) db = dbm.open(os.path.join(statedir, "seen"), "c") for item in fp["items"]: # have we seen it before? @@ -278,9 +316,13 @@ def parse_and_deliver(maildir, url, statedir): md5sum = md5.md5(content.encode("utf-8")).hexdigest() + prevmessageid = None + if db.has_key(url + "|" + item["link"]): data = db[url + "|" + item["link"]] data = cgi.parse_qs(data) + if data.has_key("message-id"): + prevmessageid = data["message-id"][0] if data["contentmd5"][0] == md5sum: continue @@ -303,6 +345,8 @@ def parse_and_deliver(maildir, url, statedir): msg.set_unixfrom("\"%s\" " %(url)) msg.add_header("From", "\"%s\" " %(author)) msg.add_header("To", "\"%s\" " %(url)) + if prevmessageid: + msg.add_header("References", prevmessageid) createddate = datetime.datetime(*item["updated_parsed"][0:6]) \ .strftime("%a, %e %b %Y %T -0000") msg.add_header("Date", createddate) @@ -337,6 +381,8 @@ def parse_and_deliver(maildir, url, statedir): os.unlink(fn) # now add to the database about the item + if prevmessageid: + messageid = prevmessageid + " " + messageid data = urllib.urlencode(( ("message-id", messageid), \ ("created", createddate), \ @@ -344,7 +390,17 @@ def parse_and_deliver(maildir, url, statedir): )) db[url + "|" + item["link"]] = data + if headers: + data = [] + for header in headers: + if header[0] in ["content-md5", "etag", "last-modified", "content-length"]: + data.append((header[0], header[1])) + if len(data) > 0: + data = urllib.urlencode(data) + feeddb[url] = data + db.close() + feeddb.close() # first off, parse the command line arguments