4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# HTMLParser subclass that builds up a plain-text rendering of the HTML fed
# to it: state flags track the element being parsed, text is buffered per
# element kind, and finished pieces are appended to self.text.
# NOTE(review): the embedded original line numbers skip in places, so several
# `def` lines and branch bodies of this class are not visible in this listing;
# the comments below describe only the statements that are shown.
#
# Constructor body (its `def` line is not visible here).
# State flags recording which kind of element is currently open; note that
# inparagraph is the only flag that starts out True.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
65 self.inparagraph = True
66 self.inblockquote = False
# Unicode buffers for the paragraph and the heading currently being read.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Initialise the HTMLParser base class.
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# Opening-tag callback. The tag name is compared case-insensitively; the
# visible branches switch the state flags for h1, h2, h3-h6, blockquote, p
# and pre, and append separating newlines to self.text. `attrs` is not used
# by any visible statement.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
92 elif tag.lower() == "blockquote":
93 self.inblockquote = True
94 self.text = self.text + u'\n'
95 elif tag.lower() == "p":
97 self.text = self.text + u'\n\n'
# Flush the paragraph text collected so far, wrapped at 70 columns, then
# start a fresh paragraph.
99 self.text = self.text \
100 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
101 self.currentparagraph = u''
102 self.inparagraph = True
103 elif tag.lower() == "pre":
104 self.text = self.text + "\n"
# <pre> leaves both paragraph and blockquote mode.
106 self.inparagraph = False
107 self.inblockquote = False
108 elif tag.lower() == "ul":
111 self.text = self.text + "\n"
112 elif tag.lower() == "li":
# Append self.item wrapped at 67 columns, each wrapped line stripped.
117 self.text = self.text \
119 + u'\n '.join([a.strip() for a in \
120 textwrap.wrap(self.item, 67)]) \
125 def handle_startendtag(self, tag, attrs):
# Self-closing tag callback; "br" is the only tag tested in the visible
# lines. The visible statements flush the pending paragraph (wrapped at 70
# columns) or the pending blockquote into self.text and reset that buffer.
126 if tag.lower() == "br":
131 self.text = self.text \
134 for a in textwrap.wrap( \
135 self.currentparagraph, 70) \
139 self.currentparagraph = u''
140 elif self.inblockquote:
# NOTE(review): this branch wraps self.blockquote.encode("utf-8"), whereas
# handle_endtag wraps self.blockquote directly - confirm which is intended.
141 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.blockquote.encode("utf-8") \
151 self.blockquote = u''
153 self.text = self.text + "\n"
155 def handle_endtag(self, tag):
# Closing-tag callback. Headings are appended to self.text followed by an
# underline made of '=' (h1), '-' (h2) or '~' (h3-h6); paragraphs,
# blockquotes and list items are wrapped and appended; the matching state
# flag and buffer are then reset.
156 if tag.lower() == "h1":
157 self.inheadingone = False
# NOTE(review): the heading is appended in UTF-8 encoded form and the
# underline length is len() of that encoded, stripped text, i.e. it is
# measured on the encoded string - TODO confirm this is intended for
# non-ASCII headings. The same pattern is used for h2 and h3-h6 below.
158 self.text = self.text \
160 + self.headingtext.encode("utf-8") \
162 + u'=' * len(self.headingtext.encode("utf-8").strip())
163 self.headingtext = u''
164 elif tag.lower() == "h2":
165 self.inheadingtwo = False
166 self.text = self.text \
168 + self.headingtext.encode("utf-8") \
170 + u'-' * len(self.headingtext.encode("utf-8").strip())
171 self.headingtext = u''
172 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
173 self.inotherheading = False
174 self.text = self.text \
176 + self.headingtext.encode("utf-8") \
178 + u'~' * len(self.headingtext.encode("utf-8").strip())
179 self.headingtext = u''
180 elif tag.lower() == "p":
# Append the paragraph wrapped at 70 columns, then leave paragraph mode.
181 self.text = self.text \
182 + u'\n'.join(textwrap.wrap( \
183 self.currentparagraph, 70) \
185 self.inparagraph = False
186 self.currentparagraph = u''
187 elif tag.lower() == "blockquote":
# Append the blockquote text wrapped at 68 columns, then reset the
# blockquote state.
188 self.text = self.text \
192 for a in textwrap.wrap( \
193 self.blockquote, 68)] \
196 self.inblockquote = False
197 self.blockquote = u''
198 elif tag.lower() == "pre":
200 elif tag.lower() == "li":
# Append self.item wrapped at 67 columns, each wrapped line stripped.
203 self.text = self.text \
206 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
209 elif tag.lower() == "ul":
212 def handle_data(self, data):
# Character-data callback. `data` is decoded with unicode(data, "utf-8") and
# added to the buffer that matches the current state: headingtext inside any
# heading, blockquote inside a blockquote, currentparagraph inside a
# paragraph; other visible statements add it to self.item or directly to
# self.text (the conditions guarding those are not visible in this listing).
213 if self.inheadingone or self.inheadingtwo or self.inotherheading:
214 self.headingtext = self.headingtext \
215 + unicode(data, "utf-8").strip() \
217 elif self.inblockquote:
218 self.blockquote = self.blockquote \
219 + unicode(data, "utf-8").strip() \
222 self.item = self.item + unicode(data, "utf-8")
223 elif self.inparagraph:
224 self.currentparagraph = self.currentparagraph \
225 + unicode(data, "utf-8").strip() \
228 self.text = self.text + unicode(data, "utf-8")
# Data consisting only of whitespace is dropped; anything else is appended
# stripped, followed by a single space.
230 isallwhitespace = data.strip() == ""
231 if not isallwhitespace:
232 self.text = self.text + unicode(data, "utf-8").strip() + u' '
234 def handle_entityref(self, name):
# Entity-reference callback. `name` is looked up (lower-cased) in the
# class-level HTML2Text.entities mapping, which is not visible in this
# listing; the other visible assignments convert name[1:] with unichr(int())
# or fall back to the literal "&" + name + ";" text. The resulting entity is
# appended to currentparagraph, blockquote or self.text.
236 if HTML2Text.entities.has_key(name.lower()):
237 entity = HTML2Text.entities[name.lower()]
239 entity = unichr(int(name[1:]))
241 entity = "&" + name + ";"
244 self.currentparagraph = self.currentparagraph \
245 + unicode(entity, "utf-8")
246 elif self.inblockquote:
247 self.blockquote = self.blockquote + unicode(entity, "utf-8")
249 self.text = self.text + unicode(entity, "utf-8")
# Fragment of a method whose `def` line is not visible here (presumably the
# gettext() called by parse_and_deliver - TODO confirm): the still-pending
# paragraph, wrapped at 70 columns, is appended to `data`.
254 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
259 def open_url(method, url):
# Send an HTTP request for `url` using the given `method` (callers in this
# file pass "HEAD" and "GET") through httplib, looping while fewer than 3
# redirects have been counted.
# NOTE(review): the initialisation of redirectcount, the handling of a found
# "location" header and the return statements are not visible in this
# listing, so the exact return contract cannot be stated from here.
261 while redirectcount < 3:
# Break the URL down into scheme, host, port and path.
# NOTE(review): the local name `type` shadows the builtin.
262 (type, rest) = urllib.splittype(url)
263 (host, path) = urllib.splithost(rest)
264 (host, port) = urllib.splitport(host)
# The connection is always made with httplib.HTTPConnection.
268 conn = httplib.HTTPConnection("%s:%s" %(host, port))
269 conn.request(method, path)
270 response = conn.getresponse()
# Redirect statuses: scan the response headers for "location".
271 if response.status in [301, 302, 303, 307]:
272 headers = response.getheaders()
273 for header in headers:
274 if header[0] == "location":
276 elif response.status == 200:
# Advance the counter tested by the loop condition.
280 redirectcount = redirectcount + 1
283 def parse_and_deliver(maildir, url, statedir):
# Fetch the feed at `url`, and for each item build a multipart/alternative
# mail (HTML part plus a plain-text part produced by HTML2Text) that is
# written under `maildir`. Two dbm databases under `statedir` hold state:
# "feeds" (per-feed HTTP header values) and "seen" (per-item message-id,
# created date and content md5).
# NOTE(review): the embedded original line numbers skip in places, so many
# conditions, try/except lines and branch bodies are not visible in this
# listing; the comments below describe only the statements that are shown.
286 # first check if we know about this feed already
287 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
288 if feeddb.has_key(url):
290 data = cgi.parse_qs(data)
# Probe the feed with a HEAD request and compare its content-length, etag,
# last-modified and content-md5 headers with the stored values.
291 response = open_url("HEAD", url)
294 headers = response.getheaders()
297 for header in headers:
298 if header[0] == "content-length":
299 if header[1] != data["content-length"][0]:
301 elif header[0] == "etag":
302 if header[1] != data["etag"][0]:
304 elif header[0] == "last-modified":
305 if header[1] != data["last-modified"][0]:
307 elif header[0] == "content-md5":
308 if header[1] != data["content-md5"][0]:
# Fetch the full feed; the response object itself is used as the feed handle.
313 response = open_url("GET", url)
315 headers = response.getheaders()
316 feedhandle = response
318 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
321 return # don't need to do anything, nothing has changed.
# Same GET fetch again (presumably the branch for a feed that is not yet in
# the feeds database - the selecting `else` is not visible here).
323 response = open_url("GET", url)
325 headers = response.getheaders()
326 feedhandle = response
328 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the feed and open the per-item "seen" database.
331 fp = feedparser.parse(feedhandle)
332 db = dbm.open(os.path.join(statedir, "seen"), "c")
333 for item in fp["items"]:
334 # have we seen it before?
335 # need to work out what the content is first...
337 if item.has_key("content"):
338 content = item["content"][0]["value"]
340 content = item["summary"]
# Fingerprint the item by the MD5 hex digest of its UTF-8 encoded content.
342 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
346 # check if there's a guid too - if that exists and we match the md5,
348 if item.has_key("guid"):
349 if db.has_key(url + "|" + item["guid"]):
350 data = db[url + "|" + item["guid"]]
351 data = cgi.parse_qs(data)
352 if data["contentmd5"][0] == md5sum:
# Items are also looked up under url + "|" + link; a stored message-id is
# remembered as prevmessageid.
355 if db.has_key(url + "|" + item["link"]):
356 data = db[url + "|" + item["link"]]
357 data = cgi.parse_qs(data)
358 if data.has_key("message-id"):
359 prevmessageid = data["message-id"][0]
360 if data["contentmd5"][0] == md5sum:
364 author = item["author"]
# The Message-ID expression below (its opening line is not visible here)
# combines a timestamp, characters drawn from letters and digits, and the
# local hostname.
368 # create a basic email message
369 msg = MIMEMultipart("alternative")
371 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
375 string.ascii_letters + string.digits \
376 ) for a in range(0,6) \
377 ]) + "@" + socket.gethostname() + ">"
378 msg.add_header("Message-ID", messageid)
# The feed URL is used for the unix From line and the To header; the item
# author is used for the From header.
379 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
380 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
381 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
383 msg.add_header("References", prevmessageid)
# Date header value: formatted either from the current time or from the
# item's updated_parsed tuple (the selecting condition is not visible here).
# NOTE(review): %e and %T are not among the strftime directives documented
# by Python; they depend on the platform C library - confirm target platforms.
384 createddate = datetime.datetime.now() \
385 .strftime("%a, %e %b %Y %T -0000")
387 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
388 .strftime("%a, %e %b %Y %T -0000")
391 msg.add_header("Date", createddate)
392 msg.add_header("Subject", item["title"])
393 msg.set_default_type("text/plain")
# HTML alternative: the item content followed by an "Item URL" link paragraph.
395 htmlcontent = content.encode("utf-8")
396 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
400 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text alternative: the same content converted with HTML2Text, followed
# by an "Item URL" line.
401 textparser = HTML2Text()
402 textparser.feed(content.encode("utf-8"))
403 textcontent = textparser.gettext()
404 textcontent = "%s\n\nItem URL: %s" %( \
407 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
411 # start by working out the filename we should be writing to, we do
412 # this following the normal maildir style rules
413 fname = str(os.getpid()) \
414 + "." + socket.gethostname() \
417 string.ascii_letters + string.digits \
418 ) for a in range(0,10) \
420 + datetime.datetime.now().strftime('%s')
# NOTE(review): '%s' above is likewise a platform-specific strftime directive.
# The message is written to a file under <maildir>/tmp first.
421 fn = os.path.join(maildir, "tmp", fname)
423 fh.write(msg.as_string())
425 # now move it into the new directory
426 newfn = os.path.join(maildir, "new", fname)
430 # now add to the database about the item
# Prefix the earlier message id(s) to the new one (the guarding condition is
# not visible here).
432 messageid = prevmessageid + " " + messageid
# Entries are stored urlencoded as message-id / created / contentmd5, keyed
# by url + "|" + guid and/or url + "|" + link.
433 if item.has_key("guid") and item["guid"] != item["link"]:
434 data = urllib.urlencode(( \
435 ("message-id", messageid), \
436 ("created", createddate), \
437 ("contentmd5", md5sum) \
439 db[url + "|" + item["guid"]] = data
441 data = db[url + "|" + item["link"]]
442 data = cgi.parse_qs(data)
443 newdata = urllib.urlencode(( \
444 ("message-id", messageid), \
445 ("created", data["created"][0]), \
446 ("contentmd5", data["contentmd5"][0]) \
448 db[url + "|" + item["link"]] = newdata
450 db[url + "|" + item["link"]] = data
452 data = urllib.urlencode(( \
453 ("message-id", messageid), \
454 ("created", createddate), \
455 ("contentmd5", md5sum) \
457 db[url + "|" + item["link"]] = data
# Collect the response headers that the HEAD comparison at the top of this
# function looks at and urlencode them (presumably to be stored in the feeds
# database - the store itself is not visible in this listing).
461 for header in headers:
462 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
463 data.append((header[0], header[1]))
465 data = urllib.urlencode(data)
471 if __name__ == "__main__":
# Script entry point: parse the command line, locate a config file, work out
# the state directory and the Maildir root, make sure each feed's maildir has
# cur/tmp/new subdirectories, then call parse_and_deliver for each feed.
# NOTE(review): the embedded original line numbers skip in places; the
# try/except lines, several conditions and the exit calls are not visible in
# this listing, so the comments below describe only the statements shown.
472 # This only gets executed if we really called the program
473 # first off, parse the command line arguments
475 oparser = OptionParser()
# Two options are declared: -c/--conf and -s/--statedir.
477 "-c", "--conf", dest="conf",
478 help="location of config file"
481 "-s", "--statedir", dest="statedir",
482 help="location of directory to store state in"
485 (options, args) = oparser.parse_args()
487 # check for the configfile
491 if options.conf != None:
492 # does the file exist?
494 os.stat(options.conf)
495 configfile = options.conf
497 # should exit here as the specified file doesn't exist
499 "Config file %s does not exist. Exiting.\n" %(options.conf,))
502 # check through the default locations
# Default locations probed: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.
504 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
505 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
508 os.stat("/etc/rss2maildir.conf")
509 configfile = "/etc/rss2maildir.conf"
511 sys.stderr.write("No config file found. Exiting.\n")
514 # Right - if we've got this far, we've got a config file, now for the hard
517 scp = SafeConfigParser()
# Default Maildir root; replaced below when the config sets
# general/maildir_root.
520 maildir_root = "RSSMaildir"
# State directory: the command-line option is checked first, then the
# general/state_dir config option.
523 if options.statedir != None:
524 state_dir = options.statedir
526 mode = os.stat(state_dir)[stat.ST_MODE]
527 if not stat.S_ISDIR(mode):
529 "State directory (%s) is not a directory\n" %(state_dir))
532 # try to make the directory
# NOTE(review): unlike the other messages in this block, this one has no
# trailing newline.
536 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
538 elif scp.has_option("general", "state_dir"):
539 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the stat and the message below use state_dir, not the
# new_state_dir just read from the config - confirm this is intended.
541 mode = os.stat(state_dir)[stat.ST_MODE]
542 if not stat.S_ISDIR(mode):
544 "State directory (%s) is not a directory\n" %(state_dir))
549 os.mkdir(new_state_dir)
550 state_dir = new_state_dir
553 "Couldn't create state directory %s\n" %(new_state_dir))
557 mode = os.stat(state_dir)[stat.ST_MODE]
558 if not stat.S_ISDIR(mode):
560 "State directory %s is not a directory\n" %(state_dir))
567 "State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from the config when present, checked to be a
# directory, and created with os.mkdir in one of the visible branches.
570 if scp.has_option("general", "maildir_root"):
571 maildir_root = scp.get("general", "maildir_root")
574 mode = os.stat(maildir_root)[stat.ST_MODE]
575 if not stat.S_ISDIR(mode):
577 "Maildir Root %s is not a directory\n" \
582 os.mkdir(maildir_root)
584 sys.stderr.write("Couldn't create Maildir Root %s\n" \
588 feeds = scp.sections()
# Every config section other than "general" is treated as a feed; the section
# name is what gets passed to parse_and_deliver as its url argument.
590 feeds.remove("general")
594 for section in feeds:
595 # check if the directory exists
598 maildir = scp.get(section, "maildir")
# Keep only the value half of a urlencoded (section, maildir) pair,
# presumably so the name is usable as a single path component under
# maildir_root - TODO confirm.
602 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
603 maildir = os.path.join(maildir_root, maildir)
606 exists = os.stat(maildir)
607 if stat.S_ISDIR(exists[stat.ST_MODE]):
608 # check if there's a new, cur and tmp directory
610 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
612 os.mkdir(os.path.join(maildir, "cur"))
613 if not stat.S_ISDIR(mode):
614 sys.stderr.write("Broken maildir: %s\n" %(maildir))
616 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
618 os.mkdir(os.path.join(maildir, "tmp"))
619 if not stat.S_ISDIR(mode):
620 sys.stderr.write("Broken maildir: %s\n" %(maildir))
622 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
623 if not stat.S_ISDIR(mode):
624 sys.stderr.write("Broken maildir: %s\n" %(maildir))
626 os.mkdir(os.path.join(maildir, "new"))
628 sys.stderr.write("Broken maildir: %s\n" %(maildir))
633 sys.stderr.write("Couldn't create root maildir %s\n" \
637 os.mkdir(os.path.join(maildir, "new"))
638 os.mkdir(os.path.join(maildir, "cur"))
639 os.mkdir(os.path.join(maildir, "tmp"))
642 "Couldn't create required maildir directories for %s\n" \
646 # right - we've got the directories, we've got the section, we know the
649 parse_and_deliver(maildir, section, state_dir)