From 7dc8a4550f4f01b8c41273576de1b9618827fb96 Mon Sep 17 00:00:00 2001 From: Brett Parker Date: Sat, 22 Dec 2007 01:02:37 +0000 Subject: [PATCH] * Add redirect support * Try to get a URL 3 times (redirects are included in the count...) * Refactor connection creation into its own function to lower duplication of code --- TODO | 1 - rss2maildir.py | 66 +++++++++++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/TODO b/TODO index 1f14187..aa98d4d 100644 --- a/TODO +++ b/TODO @@ -1,4 +1,3 @@ -* follow 302 * add ol and dl support * add multilevel list support * normalise the number of spaces in formatted text diff --git a/rss2maildir.py b/rss2maildir.py index 32a39f6..aa3b0cc 100755 --- a/rss2maildir.py +++ b/rss2maildir.py @@ -252,25 +252,42 @@ class HTML2Text(HTMLParser): data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70)) return data +def open_url(method, url): + redirectcount = 0 + while redirectcount < 3: + (type, rest) = urllib.splittype(url) + (host, path) = urllib.splithost(rest) + (host, port) = urllib.splitport(host) + if port == None: + port = 80 + try: + conn = httplib.HTTPConnection("%s:%s" %(host, port)) + conn.request(method, path) + response = conn.getresponse() + if response.status in [301, 302, 303, 307]: + headers = response.getheaders() + for header in headers: + if header[0] == "location": + url = header[1] + elif response.status == 200: + return response + except: + pass + redirectcount = redirectcount + 1 + return None + def parse_and_deliver(maildir, url, statedir): feedhandle = None headers = None # first check if we know about this feed already feeddb = dbm.open(os.path.join(statedir, "feeds"), "c") - # we need all the parts of the url - (type, rest) = urllib.splittype(url) - (host, path) = urllib.splithost(rest) - (host, port) = urllib.splitport(host) - if port == None: - port = 80 if feeddb.has_key(url): data = feeddb[url] data = cgi.parse_qs(data) - # now do a head on the 
feed to see if it's been updated - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("HEAD", path) - response = conn.getresponse() - headers = response.getheaders() + response = open_url("HEAD", url) + headers = None + if response: + headers = response.getheaders() ischanged = False try: for header in headers: @@ -289,24 +306,23 @@ def parse_and_deliver(maildir, url, statedir): except: ischanged = True if ischanged: - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("GET", path) - response = conn.getresponse() - headers = response.getheaders() - feedhandle = response + response = open_url("GET", url) + if response != None: + headers = response.getheaders() + feedhandle = response + else: + sys.stderr.write("Failed to fetch feed: %s\n" %(url)) + return else: return # don't need to do anything, nothings changed. else: - conn = httplib.HTTPConnection("%s:%s" %(host, port)) - conn.request("GET", path) - response = None - try: - response = conn.getresponse() - except: - print "Failed to fetch feed: %s" %(url) + response = open_url("GET", url) + if response != None: + headers = response.getheaders() + feedhandle = response + else: + sys.stderr.write("Failed to fetch feed: %s\n" %(url)) return - headers = response.getheaders() - feedhandle = response fp = feedparser.parse(feedhandle) db = dbm.open(os.path.join(statedir, "seen"), "c") -- 2.30.2