* multiple posts with the same link but different guid support - still
index d8cd3e4b109d4af1035f1ffd3b4d7cd935da721e..6dad334c6008b24e67fcfbd822f1d6db15004b4d 100755 (executable)
@@ -53,7 +53,7 @@ entities = {
     "pound": "£",
     "copy": "©",
     "apos": "'",
     "pound": "£",
     "copy": "©",
     "apos": "'",
-    "quote": "\"",
+    "quot": "\"",
     "nbsp": " ",
     }
 
     "nbsp": " ",
     }
 
@@ -252,25 +252,42 @@ class HTML2Text(HTMLParser):
             data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
         return data
 
+def open_url(method, url):
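+    # request the url, following redirects for up to three attempts;
+    # returns the response on a 200, or None if we never get one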
+    redirectcount = 0
+    while redirectcount < 3:
+        (type, rest) = urllib.splittype(url)
+        (host, path) = urllib.splithost(rest)
+        (host, port) = urllib.splitport(host)
+        if port == None:
+            port = 80
+        try:
+            conn = httplib.HTTPConnection("%s:%s" %(host, port))
+            conn.request(method, path)
+            response = conn.getresponse()
+            if response.status in [301, 302, 303, 307]:
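+                # redirected - pull the new url out of the headers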
+                headers = response.getheaders()
+                for header in headers:
+                    if header[0] == "location":
+                        url = header[1]
+            elif response.status == 200:
+                return response
+        except:
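+            # a failed connection just uses up one of our attempts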
+            pass
+        redirectcount = redirectcount + 1
+    return None
+
 def parse_and_deliver(maildir, url, statedir):
     feedhandle = None
     headers = None
     # first check if we know about this feed already
     feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
-    # we need all the parts of the url 
-    (type, rest) = urllib.splittype(url)
-    (host, path) = urllib.splithost(rest)
-    (host, port) = urllib.splitport(host)
-    if port == None:
-        port = 80
     if feeddb.has_key(url):
         data = feeddb[url]
         data = cgi.parse_qs(data)
-        # now do a head on the feed to see if it's been updated
-        conn = httplib.HTTPConnection("%s:%s" %(host, port))
-        conn.request("HEAD", path)
-        response = conn.getresponse()
-        headers = response.getheaders()
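+        # do a head on the feed to see if it's been updated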
+        response = open_url("HEAD", url)
+        headers = None
+        if response:
+            headers = response.getheaders()
         ischanged = False
         try:
             for header in headers:
@@ -289,24 +306,23 @@ def parse_and_deliver(maildir, url, statedir):
         except:
             ischanged = True
         if ischanged:
-            conn = httplib.HTTPConnection("%s:%s" %(host, port))
-            conn.request("GET", path)
-            response = conn.getresponse()
-            headers = response.getheaders()
-            feedhandle = response
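+            # the feed has changed, so fetch it again in full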
+            response = open_url("GET", url)
+            if response != None:
+                headers = response.getheaders()
+                feedhandle = response
+            else:
+                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
+                return
         else:
             return # don't need to do anything, nothing's changed.
     else:
-        conn = httplib.HTTPConnection("%s:%s" %(host, port))
-        conn.request("GET", path)
-        response = None
-        try:
-            response = conn.getresponse()
-        except:
-            print "Failed to fetch feed: %s" %(url)
+        response = open_url("GET", url)
+        if response != None:
+            headers = response.getheaders()
+            feedhandle = response
+        else:
+            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
             return
-        headers = response.getheaders()
-        feedhandle = response
 
     fp = feedparser.parse(feedhandle)
     db = dbm.open(os.path.join(statedir, "seen"), "c")
@@ -323,6 +339,15 @@ def parse_and_deliver(maildir, url, statedir):
 
         prevmessageid = None
 
+        # check if there's a guid too - if that exists and we match the md5,
+        # return
+        if item.has_key("guid"):
+            if db.has_key(url + "|" + item["guid"]):
+                data = db[url + "|" + item["guid"]]
+                data = cgi.parse_qs(data)
+                if data["contentmd5"][0] == md5sum:
+                    continue
+
         if db.has_key(url + "|" + item["link"]):
             data = db[url + "|" + item["link"]]
             data = cgi.parse_qs(data)
@@ -393,12 +418,31 @@ def parse_and_deliver(maildir, url, statedir):
         # now add to the database about the item
         if prevmessageid:
             messageid = prevmessageid + " " + messageid
-        data = urllib.urlencode((
-            ("message-id", messageid), \
-            ("created", createddate), \
-            ("contentmd5", md5sum) \
-            ))
-        db[url + "|" + item["link"]] = data
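+        # when an item has a guid that differs from its link, key the
+        # seen database on the guid so that multiple posts sharing one
+        # link are tracked separately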
+        if item.has_key("guid") and item["guid"] != item["link"]:
+            data = urllib.urlencode(( \
+                ("message-id", messageid), \
+                ("created", createddate), \
+                ("contentmd5", md5sum) \
+                ))
+            db[url + "|" + item["guid"]] = data
+            try:
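+                # update the entry keyed on the link too, keeping its
+                # original created date and checksum but recording the
+                # new message-id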
+                data = db[url + "|" + item["link"]]
+                data = cgi.parse_qs(data)
+                newdata = urllib.urlencode(( \
+                    ("message-id", messageid), \
+                    ("created", data["created"][0]), \
+                    ("contentmd5", data["contentmd5"][0]) \
+                    ))
+                db[url + "|" + item["link"]] = newdata
+            except:
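+                # no existing entry for the link - store this item's
+                # record under the link as well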
+                db[url + "|" + item["link"]] = data
+        else:
+            data = urllib.urlencode(( \
+                ("message-id", messageid), \
+                ("created", createddate), \
+                ("contentmd5", md5sum) \
+                ))
+            db[url + "|" + item["link"]] = data
 
     if headers:
         data = []