X-Git-Url: https://git.sommitrealweird.co.uk/rss2maildir.git/blobdiff_plain/03e4620598f673f0148f8dd4041576fb8947469b..5b6a0aea2a9275597048c8c83597080fbe345056:/rss2maildir.py diff --git a/rss2maildir.py b/rss2maildir.py index 4402c43..6dad334 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -53,7 +53,7 @@ entities = { "pound": "£", "copy": "©", "apos": "'", - "quote": "\"", + "quot": "\"", "nbsp": " ", } @@ -252,25 +252,42 @@ class HTML2Text(HTMLParser): data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70)) return data +def open_url(method, url): + redirectcount = 0 + while redirectcount < 3: + (type, rest) = urllib.splittype(url) + (host, path) = urllib.splithost(rest) + (host, port) = urllib.splitport(host) + if port == None: + port = 80 + try: + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn.request(method, path) + response = conn.getresponse() + if response.status in [301, 302, 303, 307]: + headers = response.getheaders() + for header in headers: + if header[0] == "location": + url = header[1] + elif response.status == 200: + return response + except: + pass + redirectcount = redirectcount + 1 + return None + def parse_and_deliver(maildir, url, statedir): feedhandle = None headers = None # first check if we know about this feed already feeddb = dbm.open(os.path.join(statedir, "feeds"), "c") - # we need all the parts of the url - (type, rest) = urllib.splittype(url) - (host, path) = urllib.splithost(rest) - (host, port) = urllib.splitport(host) - if port == None: - port = 80 if feeddb.has_key(url): data = feeddb[url] data = cgi.parse_qs(data) - # now do a head on the feed to see if it's been updated - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("HEAD", path) - response = conn.getresponse() - headers = response.getheaders() + response = open_url("HEAD", url) + headers = None + if response: + headers = response.getheaders() ischanged = False try: for header in headers: @@ -289,19 +306,23 @@ def parse_and_deliver(maildir, url, statedir): except: ischanged = True if ischanged: - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("GET", path) - response = conn.getresponse() - headers = response.getheaders() - feedhandle = response + response = open_url("GET", url) + if response != None: + headers = response.getheaders() + feedhandle = response + else: + sys.stderr.write("Failed to fetch feed: %s\n" %(url)) + return else: return # don't need to do anything, nothings changed. else: - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("GET", path) - response = conn.getresponse() - headers = response.getheaders() - feedhandle = response + response = open_url("GET", url) + if response != None: + headers = response.getheaders() + feedhandle = response + else: + sys.stderr.write("Failed to fetch feed: %s\n" %(url)) + return fp = feedparser.parse(feedhandle) db = dbm.open(os.path.join(statedir, "seen"), "c") @@ -318,6 +339,15 @@ def parse_and_deliver(maildir, url, statedir): prevmessageid = None + # check if there's a guid too - if that exists and we match the md5, + # return + if item.has_key("guid"): + if db.has_key(url + "|" + item["guid"]): + data = db[url + "|" + item["guid"]] + data = cgi.parse_qs(data) + if data["contentmd5"][0] == md5sum: + continue + if db.has_key(url + "|" + item["link"]): data = db[url + "|" + item["link"]] data = cgi.parse_qs(data) @@ -347,8 +377,13 @@ def parse_and_deliver(maildir, url, statedir): msg.add_header("To", "\"%s\" " %(url)) if prevmessageid: msg.add_header("References", prevmessageid) - createddate = datetime.datetime(*item["updated_parsed"][0:6]) \ + createddate = datetime.datetime.now() \ .strftime("%a, %e %b %Y %T -0000") + try: + createddate = datetime.datetime(*item["updated_parsed"][0:6]) \ + .strftime("%a, %e %b %Y %T -0000") + except: + pass msg.add_header("Date", createddate) msg.add_header("Subject", item["title"]) msg.set_default_type("text/plain") @@ -383,12 +418,31 @@ def parse_and_deliver(maildir, url, statedir): # now add to the database about the item if prevmessageid: messageid = prevmessageid + " " + messageid - data = urllib.urlencode(( - ("message-id", messageid), \ - ("created", createddate), \ - ("contentmd5", md5sum) \ - )) - db[url + "|" + item["link"]] = data + if item.has_key("guid") and item["guid"] != item["link"]: + data = urllib.urlencode(( \ + ("message-id", messageid), \ + ("created", createddate), \ + ("contentmd5", md5sum) \ + )) + db[url + "|" + item["guid"]] = data + try: + data = db[url + "|" + item["link"]] + data = cgi.parse_qs(data) + newdata = urllib.urlencode(( \ + ("message-id", messageid), \ + ("created", data["created"][0]), \ + ("contentmd5", data["contentmd5"][0]) \ + )) + db[url + "|" + item["link"]] = newdata + except: + db[url + "|" + item["link"]] = data + else: + data = urllib.urlencode(( \ + ("message-id", messageid), \ + ("created", createddate), \ + ("contentmd5", md5sum) \ + )) + db[url + "|" + item["link"]] = data if headers: data = []