4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that builds a plain-text rendering of the
# HTML fed to it, accumulating the output in self.text.
# NOTE(review): this listing is incomplete -- the embedded original line
# numbers skip, so several def lines, conditions and branch bodies are not
# visible. The comments below describe only the lines that are present.
49 class HTML2Text(HTMLParser):
# Parser state; presumably assigned inside __init__ (its def line is not
# visible). The in* flags record which kind of element is currently open.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
# Starts True, so handle_data's paragraph branch can apply to text that
# arrives before any <p> tag has been seen.
65 self.inparagraph = True
66 self.inblockquote = False
# Buffers for text collected inside the open paragraph / heading.
69 self.currentparagraph = u''
70 self.headingtext = u''
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# HTMLParser callback for an opening tag. Tag names are compared
# case-insensitively via tag.lower().
# Headings: record which kind is open (h1, h2, or any of h3-h6) and leave
# paragraph mode; handle_data collects the text, handle_endtag renders it.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
# The bodies of the "a" and "br" branches are not visible in this listing.
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
# <blockquote>: enter blockquote mode and start a new output line.
92 elif tag.lower() == "blockquote":
93 self.inblockquote = True
94 self.text = self.text + u'\n'
# <p>: the visible lines append a blank-line separator and/or the pending
# paragraph text wrapped at 70 columns, then clear the buffer and enter
# paragraph mode. The conditions selecting between the appends are missing.
95 elif tag.lower() == "p":
97 self.text = self.text + u'\n\n'
99 self.text = self.text \
100 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
101 self.currentparagraph = u''
102 self.inparagraph = True
# <pre>: start a new output line and leave paragraph and blockquote mode.
103 elif tag.lower() == "pre":
104 self.text = self.text + "\n"
106 self.inparagraph = False
107 self.inblockquote = False
# <ul>: start a new output line.
108 elif tag.lower() == "ul":
111 self.text = self.text + "\n"
# <li>: appends self.item wrapped at 67 columns, each wrapped piece stripped
# and the pieces joined by a newline plus indentation. The beginning and end
# of this expression are on lines not visible here.
112 elif tag.lower() == "li":
117 self.text = self.text \
119 + u'\n '.join([a.strip() for a in \
120 textwrap.wrap(self.item, 67)]) \
125 def handle_startendtag(self, tag, attrs):
# HTMLParser callback for self-closing tags; only "br" is examined. The
# visible branches flush the paragraph buffer (wrapped at 70 columns) or,
# when inside a blockquote, the blockquote buffer, and then clear it. The
# condition guarding the first branch is not visible.
126 if tag.lower() == "br":
131 self.text = self.text \
134 for a in textwrap.wrap( \
135 self.currentparagraph, 70) \
139 self.currentparagraph = u''
# NOTE(review): this branch wraps self.blockquote.encode("utf-8"), whereas
# handle_endtag wraps self.blockquote directly -- confirm which is intended.
140 elif self.inblockquote:
141 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.blockquote.encode("utf-8") \
151 self.blockquote = u''
# Bare newline; presumably the fall-through case (its else line is missing).
153 self.text = self.text + "\n"
# HTMLParser callback for a closing tag: emits the text buffered for the
# element being closed and clears the matching state.
155 def handle_endtag(self, tag):
# Headings are emitted followed by an underline of '=' (h1), '-' (h2) or
# '~' (h3-h6).
# NOTE(review): the underline length is len() of the UTF-8 *encoded* heading,
# i.e. a byte count under Python 2, so a non-ASCII heading would get an
# underline longer than its character count.
# NOTE(review): the encoded byte string is concatenated with u'' literals in
# the same expression; that looks like it would force an implicit ASCII
# decode -- confirm behaviour with non-ASCII headings.
156 if tag.lower() == "h1":
157 self.inheadingone = False
158 self.text = self.text \
160 + self.headingtext.encode("utf-8") \
162 + u'=' * len(self.headingtext.encode("utf-8").strip())
163 self.headingtext = u''
164 elif tag.lower() == "h2":
165 self.inheadingtwo = False
166 self.text = self.text \
168 + self.headingtext.encode("utf-8") \
170 + u'-' * len(self.headingtext.encode("utf-8").strip())
171 self.headingtext = u''
172 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
173 self.inotherheading = False
174 self.text = self.text \
176 + self.headingtext.encode("utf-8") \
178 + u'~' * len(self.headingtext.encode("utf-8").strip())
179 self.headingtext = u''
# </p>: flush the paragraph buffer wrapped at 70 columns, then leave
# paragraph mode and clear the buffer.
180 elif tag.lower() == "p":
181 self.text = self.text \
182 + u'\n'.join(textwrap.wrap( \
183 self.currentparagraph, 70) \
185 self.inparagraph = False
186 self.currentparagraph = u''
# </blockquote>: flush the blockquote buffer wrapped at 68 columns
# (presumably narrower to leave room for a quote prefix added on a line not
# visible here), then leave blockquote mode.
187 elif tag.lower() == "blockquote":
188 self.text = self.text \
192 for a in textwrap.wrap( \
193 self.blockquote, 68)] \
196 self.inblockquote = False
197 self.blockquote = u''
# The body of the "pre" branch is not visible in this listing.
198 elif tag.lower() == "pre":
# </li>: emits self.item wrapped at 67 columns, each piece stripped.
200 elif tag.lower() == "li":
203 self.text = self.text \
206 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
209 elif tag.lower() == "ul":
# HTMLParser callback for text between tags. The data is decoded from UTF-8
# bytes with unicode(data, "utf-8") and appended to the buffer belonging to
# whichever element is open.
212 def handle_data(self, data):
# Heading text goes to self.headingtext (stripped).
213 if self.inheadingone or self.inheadingtwo or self.inotherheading:
214 self.headingtext = self.headingtext \
215 + unicode(data, "utf-8").strip() \
217 elif self.inblockquote:
218 self.blockquote = self.blockquote \
219 + unicode(data, "utf-8").strip() \
222 self.item = self.item + unicode(data, "utf-8")
223 elif self.inparagraph:
224 self.currentparagraph = self.currentparagraph \
225 + unicode(data, "utf-8").strip() \
# Two remaining cases append straight to self.text: one keeps the data
# verbatim, the other strips it and adds a trailing space. The conditions
# selecting between them are not visible.
228 self.text = self.text + unicode(data, "utf-8")
230 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# HTMLParser callback for a named entity reference such as &amp;.
232 def handle_entityref(self, name):
# Resolution order visible here: lookup in the class-level table
# HTML2Text.entities (defined on lines not shown), then a numeric form that
# drops the first character of name and converts the rest with unichr(), and
# finally re-emitting the reference literally as "&name;".
234 if HTML2Text.entities.has_key(name.lower()):
235 entity = HTML2Text.entities[name.lower()]
237 entity = unichr(int(name[1:]))
239 entity = "&" + name + ";"
# The resolved entity is appended to the active buffer.
# NOTE(review): unicode(entity, "utf-8") is applied even when entity came from
# unichr() and is already unicode; under Python 2 that call rejects unicode
# input -- confirm against the missing lines.
242 self.currentparagraph = self.currentparagraph \
243 + unicode(entity, "utf-8")
244 elif self.inblockquote:
245 self.blockquote = self.blockquote + unicode(entity, "utf-8")
247 self.text = self.text + unicode(entity, "utf-8")
# From a method whose def line is not visible (presumably gettext(), which
# parse_and_deliver calls): appends any still-buffered paragraph text,
# wrapped at 70 columns, to the value being built.
252 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# Perform an HTTP request for url using the given method (callers in this
# file pass "HEAD" and "GET"), retrying while redirectcount < 3.
# NOTE(review): the counter's initialisation, any try/except and the return
# statements are on lines missing from this listing. Callers treat the return
# value as a response object and call getheaders() on it.
257 def open_url(method, url):
259 while redirectcount < 3:
# Split the URL into scheme, host, port and path with the urllib helpers.
# NOTE(review): the local name `type` shadows the builtin.
260 (type, rest) = urllib.splittype(url)
261 (host, path) = urllib.splithost(rest)
262 (host, port) = urllib.splitport(host)
# NOTE(review): the visible code always opens a plain httplib.HTTPConnection,
# whatever scheme splittype() returned.
266 conn = httplib.HTTPConnection("%s:%s" %(host, port))
267 conn.request(method, path)
268 response = conn.getresponse()
# Redirect statuses: scan the response headers for "location". The action
# taken on a match is not visible; presumably it replaces url for the retry.
269 if response.status in [301, 302, 303, 307]:
270 headers = response.getheaders()
271 for header in headers:
272 if header[0] == "location":
# 200: the branch body is not visible; presumably it returns the response.
274 elif response.status == 200:
# Every pass through the loop counts against the limit of 3.
278 redirectcount = redirectcount + 1
# Fetch the feed at url and deliver its items as multipart/alternative
# messages into maildir. State is kept in two dbm files under statedir:
# "feeds" (keyed by feed url) and "seen" (keyed by url|guid or url|link, with
# message-id / created / contentmd5 stored as a urlencoded string).
# NOTE(review): this listing is incomplete -- many conditions, try/except
# lines and call arguments are missing. The comments below describe only the
# lines that are present.
281 def parse_and_deliver(maildir, url, statedir):
284 # first check if we know about this feed already
285 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
286 if feeddb.has_key(url):
# Known feed: the stored record (presumably feeddb[url]; the read is not
# visible) is decoded with cgi.parse_qs, so every value is a list. A HEAD
# request is then issued and its headers compared with the stored ones.
288 data = cgi.parse_qs(data)
289 response = open_url("HEAD", url)
292 headers = response.getheaders()
# Compare each caching-related header with its stored value. What happens on
# a mismatch is not visible; presumably the feed is flagged as changed.
295 for header in headers:
296 if header[0] == "content-length":
297 if header[1] != data["content-length"][0]:
299 elif header[0] == "etag":
300 if header[1] != data["etag"][0]:
302 elif header[0] == "last-modified":
303 if header[1] != data["last-modified"][0]:
305 elif header[0] == "content-md5":
306 if header[1] != data["content-md5"][0]:
# Full fetch; the headers from this GET are the ones recorded at the end.
311 response = open_url("GET", url)
313 headers = response.getheaders()
314 feedhandle = response
316 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
319 return # don't need to do anything, nothings changed.
# Second GET path; presumably the branch for a feed not yet in feeddb.
321 response = open_url("GET", url)
323 headers = response.getheaders()
324 feedhandle = response
326 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the response with feedparser and open the per-item database.
329 fp = feedparser.parse(feedhandle)
330 db = dbm.open(os.path.join(statedir, "seen"), "c")
331 for item in fp["items"]:
332 # have we seen it before?
333 # need to work out what the content is first...
# Prefer the first entry of item["content"]; item["summary"] is the
# alternative (the else line is not visible).
335 if item.has_key("content"):
336 content = item["content"][0]["value"]
338 content = item["summary"]
# Hash of the UTF-8 encoded content, used to detect changed items.
340 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
344 # check if there's a guid too - if that exists and we match the md5,
# Look the item up under url|guid and under url|link. What happens when the
# stored contentmd5 equals md5sum is not visible; presumably the item is
# skipped as already delivered.
346 if item.has_key("guid"):
347 if db.has_key(url + "|" + item["guid"]):
348 data = db[url + "|" + item["guid"]]
349 data = cgi.parse_qs(data)
350 if data["contentmd5"][0] == md5sum:
353 if db.has_key(url + "|" + item["link"]):
354 data = db[url + "|" + item["link"]]
355 data = cgi.parse_qs(data)
# Keep the Message-ID(s) stored for this link; used for References below.
356 if data.has_key("message-id"):
357 prevmessageid = data["message-id"][0]
358 if data["contentmd5"][0] == md5sum:
# Any fallback for items without an "author" key is not visible.
362 author = item["author"]
366 # create a basic email message
367 msg = MIMEMultipart("alternative")
# Tail of the Message-ID expression (its start is not visible): a timestamp
# to the minute, six random letters/digits, then "@" + hostname + ">".
369 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
373 string.ascii_letters + string.digits \
374 ) for a in range(0,6) \
375 ]) + "@" + socket.gethostname() + ">"
376 msg.add_header("Message-ID", messageid)
# Envelope and address headers: the feed url is the display name for the
# unixfrom line and for To, the author for From; every address is
# rss2maildir@localhost.
377 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
378 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
379 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# References carries the earlier Message-ID(s); its guarding condition is
# not visible.
381 msg.add_header("References", prevmessageid)
# Two candidate Date values: the current time, or the first six fields of
# item["updated_parsed"]. Both use a literal "-0000" offset. The logic that
# chooses between them is not visible.
# NOTE(review): %e and %T are platform strftime extensions -- confirm they
# are available on the target systems.
382 createddate = datetime.datetime.now() \
383 .strftime("%a, %e %b %Y %T -0000")
385 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
386 .strftime("%a, %e %b %Y %T -0000")
389 msg.add_header("Date", createddate)
390 msg.add_header("Subject", item["title"])
391 msg.set_default_type("text/plain")
# HTML part: a format string that adds an "Item URL" link paragraph (its
# arguments are not visible), wrapped as a UTF-8 text/html MIMEText.
393 htmlcontent = content.encode("utf-8")
394 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
398 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the UTF-8 encoded content is run through HTML2Text, an
# "Item URL" line is added, and the result becomes a UTF-8 text/plain
# MIMEText. Attaching the parts to msg happens on lines not visible here.
399 textparser = HTML2Text()
400 textparser.feed(content.encode("utf-8"))
401 textcontent = textparser.gettext()
402 textcontent = "%s\n\nItem URL: %s" %( \
405 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
409 # start by working out the filename we should be writting to, we do
410 # this following the normal maildir style rules
# Filename pieces visible here: process id, hostname, ten random
# letters/digits, and strftime('%s') of the current time.
# NOTE(review): '%s' is a platform strftime extension (presumably epoch
# seconds) -- confirm.
411 fname = str(os.getpid()) \
412 + "." + socket.gethostname() \
415 string.ascii_letters + string.digits \
416 ) for a in range(0,10) \
418 + datetime.datetime.now().strftime('%s')
# The message is written under maildir/tmp first; opening the file handle
# and the move into maildir/new are on lines not visible here.
419 fn = os.path.join(maildir, "tmp", fname)
421 fh.write(msg.as_string())
423 # now move it in to the new directory
424 newfn = os.path.join(maildir, "new", fname)
428 # now add to the database about the item
# Chain the earlier Message-ID(s) in front of the new one (guarding
# condition not visible) so the stored value lists the whole thread.
430 messageid = prevmessageid + " " + messageid
# When the guid differs from the link, the record is stored under url|guid.
431 if item.has_key("guid") and item["guid"] != item["link"]:
432 data = urllib.urlencode(( \
433 ("message-id", messageid), \
434 ("created", createddate), \
435 ("contentmd5", md5sum) \
437 db[url + "|" + item["guid"]] = data
# The url|link record is then refreshed with the new message-id while keeping
# its stored created/contentmd5 values; the plain assignment further down
# looks like the fallback when no link record exists (the try/except lines
# are not visible).
439 data = db[url + "|" + item["link"]]
440 data = cgi.parse_qs(data)
441 newdata = urllib.urlencode(( \
442 ("message-id", messageid), \
443 ("created", data["created"][0]), \
444 ("contentmd5", data["contentmd5"][0]) \
446 db[url + "|" + item["link"]] = newdata
448 db[url + "|" + item["link"]] = data
# Otherwise (presumably no distinct guid) the record is stored under
# url|link.
450 data = urllib.urlencode(( \
451 ("message-id", messageid), \
452 ("created", createddate), \
453 ("contentmd5", md5sum) \
455 db[url + "|" + item["link"]] = data
# Collect the caching-related response headers and urlencode them; the
# statement that stores the result (presumably in feeddb) is not visible.
459 for header in headers:
460 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
461 data.append((header[0], header[1]))
463 data = urllib.urlencode(data)
# Script entry point: parse the command-line options, locate the config
# file, make sure the state directory and the maildir root exist, then
# process every feed section of the config.
# NOTE(review): this listing is incomplete -- the try/except/else lines, the
# sys.exit calls and the config read call are not visible. The comments below
# describe only the lines that are present.
469 if __name__ == "__main__":
470 # This only gets executed if we really called the program
471 # first off, parse the command line arguments
473 oparser = OptionParser()
475 "-c", "--conf", dest="conf",
476 help="location of config file"
479 "-s", "--statedir", dest="statedir",
480 help="location of directory to store state in"
483 (options, args) = oparser.parse_args()
485 # check for the configfile
489 if options.conf != None:
490 # does the file exist?
492 os.stat(options.conf)
493 configfile = options.conf
495 # should exit here as the specified file doesn't exist
497 "Config file %s does not exist. Exiting.\n" %(options.conf,))
500 # check through the default locations
# Defaults tried in order: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf. os.stat is used as the existence test.
502 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
503 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
506 os.stat("/etc/rss2maildir.conf")
507 configfile = "/etc/rss2maildir.conf"
509 sys.stderr.write("No config file found. Exiting.\n")
512 # Right - if we've got this far, we've got a config file, now for the hard
515 scp = SafeConfigParser()
# Default maildir root; the [general] maildir_root option overrides it below.
518 maildir_root = "RSSMaildir"
# State directory, first choice: the --statedir option. It must be a
# directory; the mkdir attempted when it does not exist is not visible.
521 if options.statedir != None:
522 state_dir = options.statedir
524 mode = os.stat(state_dir)[stat.ST_MODE]
525 if not stat.S_ISDIR(mode):
527 "State directory (%s) is not a directory\n" %(state_dir))
530 # try to make the directory
# NOTE(review): unlike the sibling messages, this one has no trailing "\n".
534 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
# Second choice: the [general] state_dir option from the config file.
# NOTE(review): the stat and the error message below use state_dir rather
# than the new_state_dir just read -- looks like it should be new_state_dir;
# confirm.
536 elif scp.has_option("general", "state_dir"):
537 new_state_dir = scp.get("general", "state_dir")
539 mode = os.stat(state_dir)[stat.ST_MODE]
540 if not stat.S_ISDIR(mode):
542 "State directory (%s) is not a directory\n" %(state_dir))
547 os.mkdir(new_state_dir)
548 state_dir = new_state_dir
551 "Couldn't create state directory %s\n" %(new_state_dir))
# Remaining case (presumably a default state_dir set on a line not visible):
# the same "must be a directory" check.
555 mode = os.stat(state_dir)[stat.ST_MODE]
556 if not stat.S_ISDIR(mode):
558 "State directory %s is not a directory\n" %(state_dir))
565 "State directory %s could not be created\n" %(state_dir))
# Maildir root: take it from the config if given; it must be a directory and
# is created with os.mkdir otherwise.
568 if scp.has_option("general", "maildir_root"):
569 maildir_root = scp.get("general", "maildir_root")
572 mode = os.stat(maildir_root)[stat.ST_MODE]
573 if not stat.S_ISDIR(mode):
575 "Maildir Root %s is not a directory\n" \
580 os.mkdir(maildir_root)
582 sys.stderr.write("Couldn't create Maildir Root %s\n" \
# Every section except "general" is treated as a feed; the section name is
# what gets passed to parse_and_deliver as the feed url.
586 feeds = scp.sections()
588 feeds.remove("general")
592 for section in feeds:
593 # check if the directory exists
# Maildir name: the section's "maildir" option (any fallback is not visible),
# urlencoded with the value part after "=" kept -- presumably so that
# arbitrary names are usable as a single path component -- then joined onto
# maildir_root.
596 maildir = scp.get(section, "maildir")
600 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
601 maildir = os.path.join(maildir_root, maildir)
# Existing maildir: check the cur, tmp and new subdirectories. The visible
# lines stat each one, mkdir it, and report "Broken maildir" when the path is
# not a directory; the try/except structure tying these together is missing.
604 exists = os.stat(maildir)
605 if stat.S_ISDIR(exists[stat.ST_MODE]):
606 # check if there's a new, cur and tmp directory
608 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
610 os.mkdir(os.path.join(maildir, "cur"))
611 if not stat.S_ISDIR(mode):
612 sys.stderr.write("Broken maildir: %s\n" %(maildir))
614 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
616 os.mkdir(os.path.join(maildir, "tmp"))
617 if not stat.S_ISDIR(mode):
618 sys.stderr.write("Broken maildir: %s\n" %(maildir))
620 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
621 if not stat.S_ISDIR(mode):
622 sys.stderr.write("Broken maildir: %s\n" %(maildir))
624 os.mkdir(os.path.join(maildir, "new"))
626 sys.stderr.write("Broken maildir: %s\n" %(maildir))
631 sys.stderr.write("Couldn't create root maildir %s\n" \
# Presumably the new-maildir path: create the three standard subdirectories.
635 os.mkdir(os.path.join(maildir, "new"))
636 os.mkdir(os.path.join(maildir, "cur"))
637 os.mkdir(os.path.join(maildir, "tmp"))
640 "Couldn't create required maildir directories for %s\n" \
644 # right - we've got the directories, we've got the section, we know the
# Fetch this feed and deliver its items.
647 parse_and_deliver(maildir, section, state_dir)