4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# HTMLParser subclass that accumulates a plain-text rendering of the HTML
# fed to it in self.text, wrapping collected text to self.textwidth columns.
# NOTE(review): this listing carries original line-number prefixes and skips
# some original lines, so several branch bodies below are not visible here.
61 def __init__(self,textwidth=70):
# Initialise the flags that record which kind of element is currently open,
# the buffers for paragraph and heading text, and the wrap width
# (textwidth, 70 by default), then initialise the HTMLParser base class.
# NOTE(review): self.text, self.blockquote and self.item are used by the
# handlers below, but their initialisation is not visible in this excerpt.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
65 self.inparagraph = True
66 self.inblockquote = False
69 self.currentparagraph = u''
70 self.headingtext = u''
76 self.textwidth = textwidth
77 HTMLParser.__init__(self)
79 def handle_starttag(self, tag, attrs):
# Handler for opening tags; the tag name is compared case-insensitively.
# Heading tags set the matching heading flag and leave paragraph mode;
# the other tags shown append line breaks (and, for <p>, the wrapped text of
# the pending paragraph) to self.text and update the context flags.
80 if tag.lower() == "h1":
81 self.inheadingone = True
82 self.inparagraph = False
83 elif tag.lower() == "h2":
84 self.inheadingtwo = True
85 self.inparagraph = False
86 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
87 self.inotherheading = True
88 self.inparagraph = False
# NOTE(review): the bodies of the "a" and "br" branches are not visible in
# this excerpt.
89 elif tag.lower() == "a":
91 elif tag.lower() == "br":
93 elif tag.lower() == "blockquote":
94 self.inblockquote = True
95 self.text = self.text + u'\n'
# <p>: append line breaks and the wrapped pending paragraph, clear the
# paragraph buffer and enter paragraph mode. The conditions guarding the
# individual appends are not visible in this excerpt.
96 elif tag.lower() == "p":
98 self.text = self.text + u'\n\n'
100 self.text = self.text \
101 + u'\n'.join(textwrap.wrap(self.currentparagraph, self.textwidth))
102 self.currentparagraph = u''
103 self.inparagraph = True
# <pre>: add a line break and leave both paragraph and blockquote mode.
104 elif tag.lower() == "pre":
105 self.text = self.text + "\n"
107 self.inparagraph = False
108 self.inblockquote = False
109 elif tag.lower() == "ul":
112 self.text = self.text + "\n"
# <li>: append the text held in self.item, wrapped to textwidth - 3 columns
# with each wrapped line stripped (the surrounding lines are not all shown).
113 elif tag.lower() == "li":
118 self.text = self.text \
120 + u'\n '.join([a.strip() for a in \
121 textwrap.wrap(self.item, self.textwidth - 3)]) \
126 def handle_startendtag(self, tag, attrs):
# Handler for self-closing tags. For "br", the text collected so far for the
# current paragraph or blockquote is wrapped and appended to self.text, the
# corresponding buffer is cleared, and a line break is appended.
# NOTE(review): the blockquote branch wraps the UTF-8 encoded bytes, whereas
# the paragraph branch wraps the unicode text directly - confirm intent.
127 if tag.lower() == "br":
132 self.text = self.text \
135 for a in textwrap.wrap( \
136 self.currentparagraph, self.textwidth) \
140 self.currentparagraph = u''
141 elif self.inblockquote:
142 self.text = self.text \
146 for a in textwrap.wrap( \
147 self.blockquote.encode("utf-8") \
152 self.blockquote = u''
154 self.text = self.text + "\n"
156 def handle_endtag(self, tag):
# Handler for closing tags. A closing heading appends the collected heading
# text followed by a run of '=' (h1), '-' (h2) or '~' (h3-h6) characters and
# clears the heading buffer; closing p and blockquote append their wrapped
# buffers to self.text and reset the related flag and buffer.
# NOTE(review): the run length is len() of the UTF-8 encoded, stripped
# heading, i.e. a byte count, so a heading with non-ASCII characters gets a
# run longer than its character count.
157 if tag.lower() == "h1":
158 self.inheadingone = False
159 self.text = self.text \
161 + self.headingtext.encode("utf-8") \
163 + u'=' * len(self.headingtext.encode("utf-8").strip())
164 self.headingtext = u''
165 elif tag.lower() == "h2":
166 self.inheadingtwo = False
167 self.text = self.text \
169 + self.headingtext.encode("utf-8") \
171 + u'-' * len(self.headingtext.encode("utf-8").strip())
172 self.headingtext = u''
173 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
174 self.inotherheading = False
175 self.text = self.text \
177 + self.headingtext.encode("utf-8") \
179 + u'~' * len(self.headingtext.encode("utf-8").strip())
180 self.headingtext = u''
181 elif tag.lower() == "p":
182 self.text = self.text \
183 + u'\n'.join(textwrap.wrap( \
184 self.currentparagraph, self.textwidth) \
186 self.inparagraph = False
187 self.currentparagraph = u''
188 elif tag.lower() == "blockquote":
189 self.text = self.text \
193 for a in textwrap.wrap( \
194 self.blockquote, self.textwidth - 2)] \
197 self.inblockquote = False
198 self.blockquote = u''
# NOTE(review): the bodies of the "pre" and "ul" branches, and most of the
# "li" branch, are not visible in this excerpt.
199 elif tag.lower() == "pre":
201 elif tag.lower() == "li":
204 self.text = self.text \
207 [a.strip() for a in textwrap.wrap(self.item, self.textwidth - 3)]) \
210 elif tag.lower() == "ul":
213 def handle_data(self, data):
# Handler for text between tags. The data is decoded from UTF-8 and appended
# to the buffer matching the current context: heading text, blockquote,
# list item (self.item), current paragraph, or self.text directly.
# NOTE(review): the conditions selecting the self.item and the raw self.text
# branches are not visible in this excerpt.
214 if self.inheadingone or self.inheadingtwo or self.inotherheading:
215 self.headingtext = self.headingtext \
216 + unicode(data, "utf-8").strip() \
218 elif self.inblockquote:
219 self.blockquote = self.blockquote \
220 + unicode(data, "utf-8").strip() \
223 self.item = self.item + unicode(data, "utf-8")
224 elif self.inparagraph:
225 self.currentparagraph = self.currentparagraph \
226 + unicode(data, "utf-8").strip() \
229 self.text = self.text + unicode(data, "utf-8")
# Chunks consisting only of whitespace are skipped; anything else is appended
# stripped and followed by a single space.
231 isallwhitespace = data.strip() == ""
232 if not isallwhitespace:
233 self.text = self.text + unicode(data, "utf-8").strip() + u' '
235 def handle_entityref(self, name):
# Handler for entity references. The lower-cased name is looked up in
# HTML2Text.entities; the other assignments shown build a character with
# unichr() from the digits after the first character of the name, or fall
# back to the literal "&name;" text. The result is appended to the buffer
# for the current context (paragraph, blockquote or self.text).
# NOTE(review): HTML2Text.entities is not defined in the visible lines, and
# the conditions selecting the fallbacks and the first buffer are not shown.
237 if HTML2Text.entities.has_key(name.lower()):
238 entity = HTML2Text.entities[name.lower()]
240 entity = unichr(int(name[1:]))
242 entity = "&" + name + ";"
245 self.currentparagraph = self.currentparagraph \
246 + unicode(entity, "utf-8")
247 elif self.inblockquote:
248 self.blockquote = self.blockquote + unicode(entity, "utf-8")
250 self.text = self.text + unicode(entity, "utf-8")
# NOTE(review): the next line belongs to a method whose def line is not in
# this excerpt (presumably gettext(), which is called on an HTML2Text
# instance elsewhere in this file). It appends the wrapped pending paragraph
# to a local string named data.
255 data = data + "\n".join(textwrap.wrap(self.currentparagraph, self.textwidth))
260 def open_url(method, url):
# Send an HTTP request using the given method (callers in this file pass
# "HEAD" or "GET") for url, looping while redirectcount is below 3. Each
# pass splits the URL into scheme, host, port and path with urllib, sends
# the request over an httplib.HTTPConnection and inspects the status code.
# NOTE(review): the initialisation of redirectcount, the handling of the
# "location" header value and the return statements are not visible in this
# excerpt, so the exact return contract cannot be confirmed from here.
262 while redirectcount < 3:
263 (type, rest) = urllib.splittype(url)
264 (host, path) = urllib.splithost(rest)
265 (host, port) = urllib.splitport(host)
269 conn = httplib.HTTPConnection("%s:%s" %(host, port))
270 conn.request(method, path)
271 response = conn.getresponse()
# 301/302/303/307 are redirect statuses: scan the response headers for
# "location"; 200 is the success case.
272 if response.status in [301, 302, 303, 307]:
273 headers = response.getheaders()
274 for header in headers:
275 if header[0] == "location":
277 elif response.status == 200:
281 redirectcount = redirectcount + 1
284 def parse_and_deliver(maildir, url, statedir):
# Fetch the feed at url and write its items as email messages under maildir.
# State is kept in two dbm databases inside statedir: "feeds" (data looked
# up by feed URL) and "seen" (records keyed by "<url>|<guid>" or
# "<url>|<link>"), both opened with the "c" flag.
# NOTE(review): this excerpt omits a number of original lines (guards,
# try/except scaffolding, some stores), so the comments below describe only
# what the visible lines do.
287 # first check if we know about this feed already
288 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
289 if feeddb.has_key(url):
291 data = cgi.parse_qs(data)
# Issue a HEAD request so the response headers can be compared with the
# values stored for this feed.
292 response = open_url("HEAD", url)
295 headers = response.getheaders()
# Compare content-length, etag, last-modified and content-md5 with the
# stored values; what happens on a mismatch is not visible in this excerpt.
298 for header in headers:
299 if header[0] == "content-length":
300 if header[1] != data["content-length"][0]:
302 elif header[0] == "etag":
303 if header[1] != data["etag"][0]:
305 elif header[0] == "last-modified":
306 if header[1] != data["last-modified"][0]:
308 elif header[0] == "content-md5":
309 if header[1] != data["content-md5"][0]:
314 response = open_url("GET", url)
316 headers = response.getheaders()
317 feedhandle = response
319 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
322 return # don't need to do anything, nothings changed.
324 response = open_url("GET", url)
326 headers = response.getheaders()
327 feedhandle = response
329 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the fetched response with feedparser and open the database that
# records items already handled.
332 fp = feedparser.parse(feedhandle)
333 db = dbm.open(os.path.join(statedir, "seen"), "c")
334 for item in fp["items"]:
335 # have we seen it before?
336 # need to work out what the content is first...
# Use the first "content" entry when the item has one; the other assignment
# shown takes the item's summary.
338 if item.has_key("content"):
339 content = item["content"][0]["value"]
341 content = item["summary"]
# MD5 of the UTF-8 encoded content; it is compared below with the
# "contentmd5" value stored for the item.
343 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
347 # check if there's a guid too - if that exists and we match the md5,
349 if item.has_key("guid"):
350 if db.has_key(url + "|" + item["guid"]):
351 data = db[url + "|" + item["guid"]]
352 data = cgi.parse_qs(data)
353 if data["contentmd5"][0] == md5sum:
356 if db.has_key(url + "|" + item["link"]):
357 data = db[url + "|" + item["link"]]
358 data = cgi.parse_qs(data)
359 if data.has_key("message-id"):
360 prevmessageid = data["message-id"][0]
361 if data["contentmd5"][0] == md5sum:
365 author = item["author"]
369 # create a basic email message
# multipart/alternative container; an HTML part and a plain-text part are
# built further down.
370 msg = MIMEMultipart("alternative")
# Message-ID: a timestamp, six characters taken from ASCII letters and
# digits (presumably chosen at random - that line is not visible here), and
# the local host name.
372 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
376 string.ascii_letters + string.digits \
377 ) for a in range(0,6) \
378 ]) + "@" + socket.gethostname() + ">"
379 msg.add_header("Message-ID", messageid)
380 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
381 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
382 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# References carries the message id stored for the earlier version of the
# item (the guard around this line is not visible here).
384 msg.add_header("References", prevmessageid)
# Date header value: either the current time or the item's updated_parsed
# timestamp; the selecting condition is not visible in this excerpt.
# NOTE(review): %e and %T are platform-specific strftime directives -
# confirm they are supported on every target platform.
385 createddate = datetime.datetime.now() \
386 .strftime("%a, %e %b %Y %T -0000")
388 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
389 .strftime("%a, %e %b %Y %T -0000")
392 msg.add_header("Date", createddate)
393 msg.add_header("Subject", item["title"])
394 msg.set_default_type("text/plain")
# HTML part: the item content followed by an "Item URL" paragraph.
396 htmlcontent = content.encode("utf-8")
397 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
401 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the content run through HTML2Text, followed by an
# "Item URL" line.
402 textparser = HTML2Text()
403 textparser.feed(content.encode("utf-8"))
404 textcontent = textparser.gettext()
405 textcontent = "%s\n\nItem URL: %s" %( \
408 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
412 # start by working out the filename we should be writting to, we do
413 # this following the normal maildir style rules
# The name is built from the process id, the host name, ten characters taken
# from ASCII letters and digits, and strftime('%s').
# NOTE(review): '%s' (seconds since the epoch) is a platform extension to
# strftime - confirm it is available on every target platform.
414 fname = str(os.getpid()) \
415 + "." + socket.gethostname() \
418 string.ascii_letters + string.digits \
419 ) for a in range(0,10) \
421 + datetime.datetime.now().strftime('%s')
# The message is written under tmp/ first; newfn below is the same file name
# under new/.
422 fn = os.path.join(maildir, "tmp", fname)
424 fh.write(msg.as_string())
426 # now move it in to the new directory
427 newfn = os.path.join(maildir, "new", fname)
431 # now add to the database about the item
# Prepend the previous message id(s) so the stored value keeps the whole
# chain (the guard around this line is not visible here).
433 messageid = prevmessageid + " " + messageid
# Records are stored url-encoded with the fields message-id, created and
# contentmd5. An item whose guid differs from its link gets a record under
# the guid key; records under the link key are written below as well.
434 if item.has_key("guid") and item["guid"] != item["link"]:
435 data = urllib.urlencode(( \
436 ("message-id", messageid), \
437 ("created", createddate), \
438 ("contentmd5", md5sum) \
440 db[url + "|" + item["guid"]] = data
442 data = db[url + "|" + item["link"]]
443 data = cgi.parse_qs(data)
444 newdata = urllib.urlencode(( \
445 ("message-id", messageid), \
446 ("created", data["created"][0]), \
447 ("contentmd5", data["contentmd5"][0]) \
449 db[url + "|" + item["link"]] = newdata
451 db[url + "|" + item["link"]] = data
453 data = urllib.urlencode(( \
454 ("message-id", messageid), \
455 ("created", createddate), \
456 ("contentmd5", md5sum) \
458 db[url + "|" + item["link"]] = data
# Collect the cache-related response headers and url-encode them.
# NOTE(review): presumably this is what gets stored in the "feeds" database
# for the HEAD comparison on the next run - the store itself and the
# initialisation of data as a list are not visible in this excerpt.
462 for header in headers:
463 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
464 data.append((header[0], header[1]))
466 data = urllib.urlencode(data)
472 if __name__ == "__main__":
# Script entry point: parse the command line, locate the config file,
# prepare the state directory and the maildir root, then call
# parse_and_deliver() for each feed section of the config.
# NOTE(review): this excerpt omits the try/except scaffolding and several
# other original lines, so only the visible steps are described.
473 # This only gets executed if we really called the program
474 # first off, parse the command line arguments
476 oparser = OptionParser()
# Options: -c/--conf (config file location) and -s/--statedir (directory in
# which state is stored).
478 "-c", "--conf", dest="conf",
479 help="location of config file"
482 "-s", "--statedir", dest="statedir",
483 help="location of directory to store state in"
486 (options, args) = oparser.parse_args()
488 # check for the configfile
492 if options.conf != None:
493 # does the file exist?
495 os.stat(options.conf)
496 configfile = options.conf
498 # should exit here as the specified file doesn't exist
500 "Config file %s does not exist. Exiting.\n" %(options.conf,))
503 # check through the default locations
# Default locations tried: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.
505 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
506 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
509 os.stat("/etc/rss2maildir.conf")
510 configfile = "/etc/rss2maildir.conf"
512 sys.stderr.write("No config file found. Exiting.\n")
515 # Right - if we've got this far, we've got a config file, now for the hard
518 scp = SafeConfigParser()
# Default maildir root; replaced further down when the "general" section
# has a maildir_root option.
521 maildir_root = "RSSMaildir"
# State directory precedence: the --statedir option first, then the
# state_dir option of the "general" section, then the remaining branch below.
524 if options.statedir != None:
525 state_dir = options.statedir
527 mode = os.stat(state_dir)[stat.ST_MODE]
528 if not stat.S_ISDIR(mode):
530 "State directory (%s) is not a directory\n" %(state_dir))
533 # try to make the directory
537 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
539 elif scp.has_option("general", "state_dir"):
540 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats state_dir although the configured value was read
# into new_state_dir just above - confirm which path is intended.
542 mode = os.stat(state_dir)[stat.ST_MODE]
543 if not stat.S_ISDIR(mode):
545 "State directory (%s) is not a directory\n" %(state_dir))
550 os.mkdir(new_state_dir)
551 state_dir = new_state_dir
554 "Couldn't create state directory %s\n" %(new_state_dir))
558 mode = os.stat(state_dir)[stat.ST_MODE]
559 if not stat.S_ISDIR(mode):
561 "State directory %s is not a directory\n" %(state_dir))
568 "State directory %s could not be created\n" %(state_dir))
571 if scp.has_option("general", "maildir_root"):
572 maildir_root = scp.get("general", "maildir_root")
575 mode = os.stat(maildir_root)[stat.ST_MODE]
576 if not stat.S_ISDIR(mode):
578 "Maildir Root %s is not a directory\n" \
583 os.mkdir(maildir_root)
585 sys.stderr.write("Couldn't create Maildir Root %s\n" \
589 feeds = scp.sections()
# Every section other than "general" is treated as a feed; the section name
# is passed to parse_and_deliver() as the feed URL.
591 feeds.remove("general")
595 for section in feeds:
596 # check if the directory exists
599 maildir = scp.get(section, "maildir")
# URL-encode the maildir name (keeping only the value side of "key=value"),
# presumably so it is safe as a directory name, then place it under
# maildir_root.
603 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
604 maildir = os.path.join(maildir_root, maildir)
# An existing maildir is checked for cur, tmp and new sub-directories; the
# os.mkdir calls below create them (presumably when the corresponding stat
# fails - the try/except lines are not visible in this excerpt).
607 exists = os.stat(maildir)
608 if stat.S_ISDIR(exists[stat.ST_MODE]):
609 # check if there's a new, cur and tmp directory
611 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
613 os.mkdir(os.path.join(maildir, "cur"))
614 if not stat.S_ISDIR(mode):
615 sys.stderr.write("Broken maildir: %s\n" %(maildir))
617 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
619 os.mkdir(os.path.join(maildir, "tmp"))
620 if not stat.S_ISDIR(mode):
621 sys.stderr.write("Broken maildir: %s\n" %(maildir))
623 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
624 if not stat.S_ISDIR(mode):
625 sys.stderr.write("Broken maildir: %s\n" %(maildir))
627 os.mkdir(os.path.join(maildir, "new"))
629 sys.stderr.write("Broken maildir: %s\n" %(maildir))
634 sys.stderr.write("Couldn't create root maildir %s\n" \
638 os.mkdir(os.path.join(maildir, "new"))
639 os.mkdir(os.path.join(maildir, "cur"))
640 os.mkdir(os.path.join(maildir, "tmp"))
643 "Couldn't create required maildir directories for %s\n" \
647 # right - we've got the directories, we've got the section, we know the
650 parse_and_deliver(maildir, section, state_dir)