4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that accumulates a plain-text rendering of
# the HTML it is fed in self.text. Paragraph text is wrapped with
# textwrap.wrap at 70 columns, blockquote text at 68 and list items at 67.
# NOTE(review): this listing has lost its indentation and several original
# lines (the leading numbers are the original line numbers and show the
# gaps), so the comments here describe only the statements that are visible.
60 class HTML2Text(HTMLParser):
# Initial parser state: flags recording which kind of element the parser is
# currently inside. Presumably the body of __init__ -- its def line is not
# part of this listing.
63 self.inheadingone = False
64 self.inheadingtwo = False
65 self.inotherheading = False
66 self.inparagraph = True
67 self.inblockquote = False
# Buffers holding the text of the paragraph / heading being collected.
70 self.currentparagraph = u''
71 self.headingtext = u''
77 HTMLParser.__init__(self)
# Start-tag handler: sets the state flags for h1, h2, h3-h6, blockquote, p
# and pre, and appends newlines to self.text for the block-level tags.
# Tag names are compared case-insensitively via tag.lower().
79 def handle_starttag(self, tag, attrs):
80 if tag.lower() == "h1":
81 self.inheadingone = True
82 self.inparagraph = False
83 elif tag.lower() == "h2":
84 self.inheadingtwo = True
85 self.inparagraph = False
86 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
87 self.inotherheading = True
88 self.inparagraph = False
89 elif tag.lower() == "a":
91 elif tag.lower() == "br":
93 elif tag.lower() == "blockquote":
94 self.inblockquote = True
95 self.text = self.text + u'\n'
# A new <p> writes out the pending paragraph text wrapped at 70 columns and
# re-enters paragraph mode (the conditions on original lines 97 and 99 are
# not visible here).
96 elif tag.lower() == "p":
98 self.text = self.text + u'\n\n'
100 self.text = self.text \
101 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
102 self.currentparagraph = u''
103 self.inparagraph = True
# <pre> leaves both paragraph and blockquote mode.
104 elif tag.lower() == "pre":
105 self.text = self.text + "\n"
107 self.inparagraph = False
108 self.inblockquote = False
109 elif tag.lower() == "ul":
112 self.text = self.text + "\n"
113 elif tag.lower() == "li" and self.inul:
118 self.text = self.text \
120 + u'\n '.join([a.strip() for a in \
121 textwrap.wrap(self.item, 67)]) \
# Self-closing tag handler. Only <br/> is examined in the visible code: it
# writes out the pending paragraph or blockquote text and appends a newline.
125 def handle_startendtag(self, tag, attrs):
126 if tag.lower() == "br":
131 self.text = self.text \
134 for a in textwrap.wrap( \
135 self.currentparagraph, 70) \
139 self.currentparagraph = u''
# NOTE(review): this branch wraps self.blockquote.encode("utf-8") (a byte
# string) while handle_endtag wraps self.blockquote unencoded; mixing the
# encoded bytes into the unicode self.text looks unintended -- confirm.
140 elif self.inblockquote:
141 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.blockquote.encode("utf-8") \
151 self.blockquote = u''
153 self.text = self.text + "\n"
# End-tag handler: clears the flag set by handle_starttag and writes the text
# collected for that element to self.text. Headings are followed by a run of
# '=' (h1), '-' (h2) or '~' (h3-h6) characters.
# NOTE(review): the underline length is len() of the UTF-8 *encoded* heading,
# i.e. a byte count, so it exceeds the visible heading width for non-ASCII
# text; the encoded heading is also concatenated with unicode strings.
155 def handle_endtag(self, tag):
156 if tag.lower() == "h1":
157 self.inheadingone = False
158 self.text = self.text \
160 + self.headingtext.encode("utf-8") \
162 + u'=' * len(self.headingtext.encode("utf-8").strip())
163 self.headingtext = u''
164 elif tag.lower() == "h2":
165 self.inheadingtwo = False
166 self.text = self.text \
168 + self.headingtext.encode("utf-8") \
170 + u'-' * len(self.headingtext.encode("utf-8").strip())
171 self.headingtext = u''
172 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
173 self.inotherheading = False
174 self.text = self.text \
176 + self.headingtext.encode("utf-8") \
178 + u'~' * len(self.headingtext.encode("utf-8").strip())
179 self.headingtext = u''
180 elif tag.lower() == "p":
181 self.text = self.text \
182 + u'\n'.join(textwrap.wrap( \
183 self.currentparagraph, 70) \
185 self.inparagraph = False
186 self.currentparagraph = u''
187 elif tag.lower() == "blockquote":
188 self.text = self.text \
192 for a in textwrap.wrap( \
193 self.blockquote, 68)] \
196 self.inblockquote = False
197 self.blockquote = u''
198 elif tag.lower() == "pre":
200 elif tag.lower() == "li":
203 self.text = self.text \
206 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
209 elif tag.lower() == "ul":
# Character-data handler: decodes data as UTF-8 and appends it to the buffer
# matching the current state (heading, blockquote, paragraph, list item), or
# straight to self.text otherwise. Heading, blockquote and paragraph text is
# stripped of surrounding whitespace; list-item text is appended as-is.
212 def handle_data(self, data):
213 if self.inheadingone or self.inheadingtwo or self.inotherheading:
214 self.headingtext = self.headingtext \
215 + unicode(data, "utf-8").strip() \
217 elif self.inblockquote:
218 self.blockquote = self.blockquote \
219 + unicode(data, "utf-8").strip() \
221 elif self.inparagraph:
222 self.currentparagraph = self.currentparagraph \
223 + unicode(data, "utf-8").strip() \
225 elif self.inul and self.initem:
226 self.item = self.item + unicode(data, "utf-8")
228 self.text = self.text + unicode(data, "utf-8")
230 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# Entity-reference handler: looks the lower-cased name up in `entities`
# (defined outside this listing), otherwise converts name[1:] with unichr(),
# otherwise falls back to the literal "&name;" text. The result is appended
# to the paragraph buffer, the blockquote buffer or self.text.
# NOTE(review): if the unichr() result reaches unicode(entity, "utf-8")
# unchanged, Python 2 raises TypeError (decoding an object that is already
# unicode) -- confirm against the missing branch lines.
232 def handle_entityref(self, name):
234 if entities.has_key(name.lower()):
235 entity = entities[name.lower()]
237 entity = unichr(int(name[1:]))
239 entity = "&" + name + ";"
242 self.currentparagraph = self.currentparagraph \
243 + unicode(entity, "utf-8")
244 elif self.inblockquote:
245 self.blockquote = self.blockquote + unicode(entity, "utf-8")
247 self.text = self.text + unicode(entity, "utf-8")
# Fragment of a later method whose def line is not in this listing
# (presumably gettext(), which parse_and_deliver calls): appends the
# still-pending paragraph text, wrapped at 70 columns, to data.
252 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# parse_and_deliver(maildir, url, statedir): fetch the feed at `url` over
# HTTP and write one multipart/alternative message per feed item (an HTML
# part plus a plain-text part rendered by HTML2Text) into `maildir`.
# Two dbm databases under `statedir` hold state: "feeds" (cached response
# headers per feed URL) and "seen" (message-id, date and content md5 per
# item). Returns early when the HEAD comparison shows no change.
# NOTE(review): this listing has lost its indentation and several original
# lines (see the gaps in the leading line numbers); comments describe only
# what is visible.
255 def parse_and_deliver(maildir, url, statedir):
258 # first check if we know about this feed already
259 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
260 # we need all the parts of the url
# NOTE(review): urllib.splitport() returns None for the port when the URL
# carries no explicit port; unless a line missing from this listing supplies
# a default, the "%s:%s" connection strings below become "host:None".
261 (type, rest) = urllib.splittype(url)
262 (host, path) = urllib.splithost(rest)
263 (host, port) = urllib.splitport(host)
# Known feed: the stored value is a urlencoded string, parsed back into a
# dict of lists with cgi.parse_qs.
266 if feeddb.has_key(url):
268 data = cgi.parse_qs(data)
269 # now do a head on the feed to see if it's been updated
270 conn = httplib.HTTPConnection("%s:%s" %(host, port))
271 conn.request("HEAD", path)
272 response = conn.getresponse()
273 headers = response.getheaders()
# Compare content-length, etag, last-modified and content-md5 with the cached
# values (the statements executed on a mismatch are not in this listing).
# NOTE(review): only headers present on the previous fetch are stored (see
# the end of this function), so data["etag"] etc. can raise KeyError when the
# server starts sending a header it did not send before -- confirm whether a
# missing line guards this.
276 for header in headers:
277 if header[0] == "content-length":
278 if header[1] != data["content-length"][0]:
280 elif header[0] == "etag":
281 if header[1] != data["etag"][0]:
283 elif header[0] == "last-modified":
284 if header[1] != data["last-modified"][0]:
286 elif header[0] == "content-md5":
287 if header[1] != data["content-md5"][0]:
# Fetch the feed body with GET (presumably only when a change was detected;
# the condition line is not visible).
292 conn = httplib.HTTPConnection("%s:%s" %(host, port))
293 conn.request("GET", path)
294 response = conn.getresponse()
295 headers = response.getheaders()
296 feedhandle = response
298 return # nothing has changed, so there is nothing to do
# Feed not in the "feeds" database yet: fetch it with a plain GET.
300 conn = httplib.HTTPConnection("%s:%s" %(host, port))
301 conn.request("GET", path)
304 response = conn.getresponse()
306 print "Failed to fetch feed: %s" %(url)
308 headers = response.getheaders()
309 feedhandle = response
# feedparser reads directly from the HTTP response object.
311 fp = feedparser.parse(feedhandle)
312 db = dbm.open(os.path.join(statedir, "seen"), "c")
313 for item in fp["items"]:
314 # have we seen it before?
315 # need to work out what the content is first...
317 if item.has_key("content"):
318 content = item["content"][0]["value"]
320 content = item["summary"]
# md5 of the UTF-8 encoded content is the change detector for an item.
322 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# Per-item state is keyed by "<feed url>|<item link>" and stored as a
# urlencoded string; the stored contentmd5 is compared with the current one.
326 if db.has_key(url + "|" + item["link"]):
327 data = db[url + "|" + item["link"]]
328 data = cgi.parse_qs(data)
329 if data.has_key("message-id"):
330 prevmessageid = data["message-id"][0]
331 if data["contentmd5"][0] == md5sum:
335 author = item["author"]
339 # create a basic email message
340 msg = MIMEMultipart("alternative")
342 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
346 string.ascii_letters + string.digits \
347 ) for a in range(0,6) \
348 ]) + "@" + socket.gethostname() + ">"
349 msg.add_header("Message-ID", messageid)
# From carries the item author, To and the unix-from line carry the feed URL,
# all at the fixed address rss2maildir@localhost.
350 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
351 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
352 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
354 msg.add_header("References", prevmessageid)
# Date header: either the current time or the item's updated_parsed time,
# always labelled -0000.
# NOTE(review): %e and %T (and %s further down) are handed to the platform's
# strftime; they are not among the directives Python documents as portable.
355 createddate = datetime.datetime.now() \
356 .strftime("%a, %e %b %Y %T -0000")
358 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
359 .strftime("%a, %e %b %Y %T -0000")
362 msg.add_header("Date", createddate)
363 msg.add_header("Subject", item["title"])
364 msg.set_default_type("text/plain")
366 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
367 textparser = HTML2Text()
368 textparser.feed(content.encode("utf-8"))
369 textcontent = textparser.gettext()
370 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
374 # start by working out the filename we should be writing to, we do
375 # this following the normal maildir style rules
376 fname = str(os.getpid()) \
377 + "." + socket.gethostname() \
380 string.ascii_letters + string.digits \
381 ) for a in range(0,10) \
383 + datetime.datetime.now().strftime('%s')
384 fn = os.path.join(maildir, "tmp", fname)
386 fh.write(msg.as_string())
388 # now move it in to the new directory
389 newfn = os.path.join(maildir, "new", fname)
393 # now add to the database about the item
395 messageid = prevmessageid + " " + messageid
396 data = urllib.urlencode((
397 ("message-id", messageid), \
398 ("created", createddate), \
399 ("contentmd5", md5sum) \
401 db[url + "|" + item["link"]] = data
# Collect the cache-related response headers and urlencode them; this is the
# format the HEAD comparison at the top of the function parses (the line that
# stores the result is not visible in this listing).
405 for header in headers:
406 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
407 data.append((header[0], header[1]))
409 data = urllib.urlencode(data)
# Script body: parse the -c/--conf and -s/--statedir options, locate a config
# file, resolve the state directory and the maildir root, make sure every
# feed section has a maildir with new/cur/tmp, then call parse_and_deliver()
# for each section other than "general" (the section name is passed as the
# feed URL).
# NOTE(review): this listing has lost its indentation and several original
# lines (see the gaps in the leading line numbers); comments describe only
# what is visible.
415 # first off, parse the command line arguments
417 oparser = OptionParser()
419 "-c", "--conf", dest="conf",
420 help="location of config file"
423 "-s", "--statedir", dest="statedir",
424 help="location of directory to store state in"
427 (options, args) = oparser.parse_args()
429 # check for the configfile
# An explicit --conf must exist; os.stat() is used as the existence probe.
433 if options.conf != None:
434 # does the file exist?
436 os.stat(options.conf)
437 configfile = options.conf
439 # should exit here as the specified file doesn't exist
441 "Config file %s does not exist. Exiting.\n" %(options.conf,))
444 # check through the default locations
# Defaults are tried in this order: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.
446 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
447 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
450 os.stat("/etc/rss2maildir.conf")
451 configfile = "/etc/rss2maildir.conf"
453 sys.stderr.write("No config file found. Exiting.\n")
456 # Right - if we've got this far, we've got a config file, now for the hard
459 scp = SafeConfigParser()
462 maildir_root = "RSSMaildir"
# State directory: the -s option takes precedence, then [general] state_dir
# from the config file, then whatever state_dir was initialised to on a line
# not shown here. In each case the path must be a directory, and the visible
# code tries to create it otherwise.
465 if options.statedir != None:
466 state_dir = options.statedir
468 mode = os.stat(state_dir)[stat.ST_MODE]
469 if not stat.S_ISDIR(mode):
471 "State directory (%s) is not a directory\n" %(state_dir))
474 # try to make the directory
478 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
480 elif scp.has_option("general", "state_dir"):
481 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the value just read from the config is new_state_dir, but the
# stat below and its error message use state_dir; this looks like it should
# be new_state_dir -- confirm.
483 mode = os.stat(state_dir)[stat.ST_MODE]
484 if not stat.S_ISDIR(mode):
486 "State directory (%s) is not a directory\n" %(state_dir))
491 os.mkdir(new_state_dir)
492 state_dir = new_state_dir
495 "Couldn't create state directory %s\n" %(new_state_dir))
499 mode = os.stat(state_dir)[stat.ST_MODE]
500 if not stat.S_ISDIR(mode):
502 "State directory %s is not a directory\n" %(state_dir))
509 "State directory %s could not be created\n" %(state_dir))
# Maildir root: [general] maildir_root overrides the "RSSMaildir" default; it
# must be a directory and is created with os.mkdir otherwise.
512 if scp.has_option("general", "maildir_root"):
513 maildir_root = scp.get("general", "maildir_root")
516 mode = os.stat(maildir_root)[stat.ST_MODE]
517 if not stat.S_ISDIR(mode):
519 "Maildir Root %s is not a directory\n" \
524 os.mkdir(maildir_root)
526 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section except "general" describes one feed.
529 feeds = scp.sections()
531 feeds.remove("general")
535 for section in feeds:
536 # check if the directory exists
539 maildir = scp.get(section, "maildir")
# urlencode() is used to escape the maildir name; only the value half of the
# resulting "key=value" string is kept, then joined under maildir_root.
543 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
544 maildir = os.path.join(maildir_root, maildir)
547 exists = os.stat(maildir)
548 if stat.S_ISDIR(exists[stat.ST_MODE]):
549 # check if there's a new, cur and tmp directory
551 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
553 os.mkdir(os.path.join(maildir, "cur"))
554 if not stat.S_ISDIR(mode):
555 sys.stderr.write("Broken maildir: %s\n" %(maildir))
557 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
559 os.mkdir(os.path.join(maildir, "tmp"))
560 if not stat.S_ISDIR(mode):
561 sys.stderr.write("Broken maildir: %s\n" %(maildir))
563 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
564 if not stat.S_ISDIR(mode):
565 sys.stderr.write("Broken maildir: %s\n" %(maildir))
567 os.mkdir(os.path.join(maildir, "new"))
569 sys.stderr.write("Broken maildir: %s\n" %(maildir))
574 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# Creation of the three standard subdirectories (presumably for a maildir
# that did not exist yet; the enclosing branch is not visible).
577 os.mkdir(os.path.join(maildir, "new"))
578 os.mkdir(os.path.join(maildir, "cur"))
579 os.mkdir(os.path.join(maildir, "tmp"))
582 "Couldn't create required maildir directories for %s\n" \
586 # right - we've got the directories, we've got the section, we know the
589 parse_and_deliver(maildir, section, state_dir)