4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
49 class HTML2Text(HTMLParser):
# HTMLParser subclass that accumulates a plain-text rendering of the HTML fed
# to it: headings are underlined, paragraphs and blockquotes are re-wrapped
# with textwrap, and list items are wrapped to a narrower width.
#
# NOTE(review): this listing keeps the original file's line numbers at the
# start of every line and skips some original lines (e.g. 50-61, 89, 91), so
# several method headers, conditions and branch bodies are not visible here.
# Comments below describe only what the visible lines do.
#
# Parser state: which kind of element is currently open.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
# inparagraph starts out True, so handle_data routes text that arrives before
# any tag into currentparagraph.
65 self.inparagraph = True
66 self.inblockquote = False
# Buffers holding text collected for the element currently being parsed.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Base-class initialisation runs after the state above has been set up.
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# Dispatches on the lower-cased tag name of an opening tag. Heading tags set
# the matching heading flag and switch paragraph collection off.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
# NOTE(review): the bodies of the "a" and "br" branches (original lines 89 and
# 91) are not part of this listing.
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
92 elif tag.lower() == "blockquote":
93 self.inblockquote = True
94 self.text = self.text + u'\n'
# <p>: emits a blank-line separator and flushes any pending paragraph text
# wrapped at 70 columns, then starts a new paragraph. The conditions guarding
# these statements (original lines 96 and 98) are not visible.
95 elif tag.lower() == "p":
97 self.text = self.text + u'\n\n'
99 self.text = self.text \
100 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
101 self.currentparagraph = u''
102 self.inparagraph = True
103 elif tag.lower() == "pre":
104 self.text = self.text + "\n"
106 self.inparagraph = False
107 self.inblockquote = False
108 elif tag.lower() == "ul":
111 self.text = self.text + "\n"
# <li> is only acted on while inside a <ul> (self.inul); the buffered item
# text is wrapped at 67 columns and each wrapped line is stripped.
112 elif tag.lower() == "li" and self.inul:
117 self.text = self.text \
119 + u'\n '.join([a.strip() for a in \
120 textwrap.wrap(self.item, 67)]) \
124 def handle_startendtag(self, tag, attrs):
# Handles self-closing tags; only "br" is tested in the visible lines. The
# pending paragraph (wrapped at 70) or the pending blockquote is flushed to
# self.text and its buffer reset, then a newline is appended.
125 if tag.lower() == "br":
130 self.text = self.text \
133 for a in textwrap.wrap( \
134 self.currentparagraph, 70) \
138 self.currentparagraph = u''
# NOTE(review): the blockquote buffer is encoded to UTF-8 bytes before being
# wrapped here, whereas handle_endtag wraps the unicode buffer directly -
# confirm the encode is intended.
139 elif self.inblockquote:
140 self.text = self.text \
144 for a in textwrap.wrap( \
145 self.blockquote.encode("utf-8") \
150 self.blockquote = u''
152 self.text = self.text + "\n"
154 def handle_endtag(self, tag):
# Dispatches on the lower-cased tag name of a closing tag and flushes the
# buffer that belongs to the element being closed.
155 if tag.lower() == "h1":
156 self.inheadingone = False
# Heading text is emitted followed by an underline of '=' characters (h1),
# '-' (h2) or '~' (h3-h6).
# NOTE(review): the underline length is the length of the UTF-8 *encoded*
# heading, i.e. a byte count, and the encoded byte string is concatenated with
# unicode literals - under Python 2 that relies on implicit ASCII decoding;
# verify with non-ASCII headings.
157 self.text = self.text \
159 + self.headingtext.encode("utf-8") \
161 + u'=' * len(self.headingtext.encode("utf-8").strip())
162 self.headingtext = u''
163 elif tag.lower() == "h2":
164 self.inheadingtwo = False
165 self.text = self.text \
167 + self.headingtext.encode("utf-8") \
169 + u'-' * len(self.headingtext.encode("utf-8").strip())
170 self.headingtext = u''
171 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
172 self.inotherheading = False
173 self.text = self.text \
175 + self.headingtext.encode("utf-8") \
177 + u'~' * len(self.headingtext.encode("utf-8").strip())
178 self.headingtext = u''
# </p>: flush the paragraph buffer wrapped at 70 columns and leave paragraph
# mode.
179 elif tag.lower() == "p":
180 self.text = self.text \
181 + u'\n'.join(textwrap.wrap( \
182 self.currentparagraph, 70) \
184 self.inparagraph = False
185 self.currentparagraph = u''
# </blockquote>: flush the blockquote buffer wrapped at 68 columns and leave
# blockquote mode.
186 elif tag.lower() == "blockquote":
187 self.text = self.text \
191 for a in textwrap.wrap( \
192 self.blockquote, 68)] \
195 self.inblockquote = False
196 self.blockquote = u''
# NOTE(review): the body of the "pre" branch (original line 198) is not
# visible in this listing.
197 elif tag.lower() == "pre":
199 elif tag.lower() == "li":
202 self.text = self.text \
205 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
208 elif tag.lower() == "ul":
# NOTE(review): the body of the "ul" branch above (original lines 209-210) is
# not visible in this listing.
211 def handle_data(self, data):
# Routes character data into the buffer for the innermost open element, in
# this priority order: heading, blockquote, paragraph, list item, and
# otherwise straight into self.text. Every branch decodes data with
# unicode(data, "utf-8"), so data is expected to be a UTF-8 byte string.
212 if self.inheadingone or self.inheadingtwo or self.inotherheading:
213 self.headingtext = self.headingtext \
214 + unicode(data, "utf-8").strip() \
216 elif self.inblockquote:
217 self.blockquote = self.blockquote \
218 + unicode(data, "utf-8").strip() \
220 elif self.inparagraph:
221 self.currentparagraph = self.currentparagraph \
222 + unicode(data, "utf-8").strip() \
224 elif self.inul and self.initem:
# List-item text is appended without stripping.
225 self.item = self.item + unicode(data, "utf-8")
# Two fall-through variants follow: one appends the data untouched, the other
# strips it and adds a trailing space. The conditions choosing between them
# (original lines 226 and 228) are not visible.
227 self.text = self.text + unicode(data, "utf-8")
229 self.text = self.text + unicode(data, "utf-8").strip() + u' '
231 def handle_entityref(self, name):
# Resolves an entity reference to replacement text and appends it to the
# currently active buffer.
# The lookup table HTML2Text.entities is a class attribute that is not part of
# this listing; names are matched case-insensitively.
233 if HTML2Text.entities.has_key(name.lower()):
234 entity = HTML2Text.entities[name.lower()]
# Otherwise the characters after the first one are parsed as a decimal code
# point (the guarding condition on original line 235 is not visible).
236 entity = unichr(int(name[1:]))
# Unknown entities are passed through literally as "&name;".
238 entity = "&" + name + ";"
# NOTE(review): unicode(entity, "utf-8") expects a byte string; when entity
# came from unichr() it is already unicode - confirm this path works.
241 self.currentparagraph = self.currentparagraph \
242 + unicode(entity, "utf-8")
243 elif self.inblockquote:
244 self.blockquote = self.blockquote + unicode(entity, "utf-8")
246 self.text = self.text + unicode(entity, "utf-8")
# NOTE(review): the header of the method owning the next line (original lines
# 247-250) is not visible; presumably it is the gettext() that callers use.
# It appends any still-pending paragraph text, wrapped at 70 columns.
251 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
254 def open_url(method, url):
# Issues an HTTP request for url using the given method (callers in this file
# pass "HEAD" and "GET") and follows redirects. The loop is bounded by
# redirectcount < 3; its initialisation (original line 255) and the return
# statements are not visible in this listing.
256 while redirectcount < 3:
# Break the URL into scheme, host, path and port.
# NOTE(review): the local name `type` shadows the builtin, and the scheme is
# not consulted in the visible lines - HTTPConnection is used regardless, so
# https URLs look unsupported here; confirm.
257 (type, rest) = urllib.splittype(url)
258 (host, path) = urllib.splithost(rest)
259 (host, port) = urllib.splitport(host)
# NOTE(review): original lines 260-262 are not visible; presumably they
# supply a default when the URL carries no explicit port - confirm.
263 conn = httplib.HTTPConnection("%s:%s" %(host, port))
264 conn.request(method, path)
265 response = conn.getresponse()
# Redirect statuses: scan the response headers for "location". What is done
# with the value (original line 270) is not visible.
266 if response.status in [301, 302, 303, 307]:
267 headers = response.getheaders()
268 for header in headers:
269 if header[0] == "location":
# The body of the 200 branch (original lines 272-273) is not visible; callers
# use the function's result as a response object with getheaders().
271 elif response.status == 200:
275 redirectcount = redirectcount + 1
278 def parse_and_deliver(maildir, url, statedir):
# Fetches the feed at url, turns each new or changed item into a
# multipart/alternative e-mail (HTML part plus plain-text part) and writes it
# into the Maildir at maildir. State lives in two dbm files under statedir:
# "feeds" (per-feed HTTP headers) and "seen" (per-item records keyed by
# "<url>|<guid or link>").
#
# NOTE(review): this listing keeps the original line numbers and skips some
# lines; try/except/else lines and several branch bodies are not visible.
281 # first check if we know about this feed already
282 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
283 if feeddb.has_key(url):
# The stored record is a URL-encoded query string; parse_qs yields a dict
# mapping each key to a list of values.
285 data = cgi.parse_qs(data)
# A HEAD request fetches the current headers so they can be compared with the
# stored ones before downloading the feed body.
286 response = open_url("HEAD", url)
289 headers = response.getheaders()
# Compare content-length, etag, last-modified and content-md5 with the stored
# values. What happens on a mismatch (original lines 295, 298, 301, 304) is
# not visible.
# NOTE(review): data[...] is indexed directly; confirm a header that was not
# stored previously is handled in the lines not shown.
292 for header in headers:
293 if header[0] == "content-length":
294 if header[1] != data["content-length"][0]:
296 elif header[0] == "etag":
297 if header[1] != data["etag"][0]:
299 elif header[0] == "last-modified":
300 if header[1] != data["last-modified"][0]:
302 elif header[0] == "content-md5":
303 if header[1] != data["content-md5"][0]:
# Download the feed body; the response object itself is handed to the feed
# parser further down.
308 response = open_url("GET", url)
310 headers = response.getheaders()
311 feedhandle = response
313 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
316 return # don't need to do anything, nothing's changed.
# Same download sequence again; the else/except lines that separate it from
# the sequence above are not visible.
318 response = open_url("GET", url)
320 headers = response.getheaders()
321 feedhandle = response
323 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
326 fp = feedparser.parse(feedhandle)
327 db = dbm.open(os.path.join(statedir, "seen"), "c")
328 for item in fp["items"]:
329 # have we seen it before?
330 # need to work out what the content is first...
# Prefer the first "content" entry; the other visible assignment uses the
# item's summary.
332 if item.has_key("content"):
333 content = item["content"][0]["value"]
335 content = item["summary"]
# MD5 of the UTF-8 encoded content is the change detector stored per item.
337 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
341 # check if there's a guid too - if that exists and we match the md5,
# Look the item up by guid first, then by link. Stored records are URL-encoded
# query strings carrying "contentmd5" and optionally "message-id". What
# happens when the checksum matches (original lines 348 and 356) is not
# visible.
343 if item.has_key("guid"):
344 if db.has_key(url + "|" + item["guid"]):
345 data = db[url + "|" + item["guid"]]
346 data = cgi.parse_qs(data)
347 if data["contentmd5"][0] == md5sum:
350 if db.has_key(url + "|" + item["link"]):
351 data = db[url + "|" + item["link"]]
352 data = cgi.parse_qs(data)
# Keep the earlier Message-ID so the new message can reference it.
353 if data.has_key("message-id"):
354 prevmessageid = data["message-id"][0]
355 if data["contentmd5"][0] == md5sum:
359 author = item["author"]
363 # create a basic email message
364 msg = MIMEMultipart("alternative")
# The Message-ID is assembled from a minute-resolution timestamp, six
# characters chosen from letters and digits, and the local hostname (the
# start of the expression, original line 365, is not visible).
366 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
370 string.ascii_letters + string.digits \
371 ) for a in range(0,6) \
372 ]) + "@" + socket.gethostname() + ">"
373 msg.add_header("Message-ID", messageid)
# Envelope sender and To use the feed URL as display name; From uses the
# author. All three use the fixed address rss2maildir@localhost.
374 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
375 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
376 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# References points at the previously delivered version of this item (the
# guarding condition on original line 377 is not visible).
378 msg.add_header("References", prevmessageid)
# Date header: either the current time or the item's updated_parsed time,
# both formatted with a fixed "-0000" offset.
# NOTE(review): %e and %T are handed to the platform strftime - confirm they
# are available on every target platform.
379 createddate = datetime.datetime.now() \
380 .strftime("%a, %e %b %Y %T -0000")
382 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
383 .strftime("%a, %e %b %Y %T -0000")
386 msg.add_header("Date", createddate)
387 msg.add_header("Subject", item["title"])
388 msg.set_default_type("text/plain")
# HTML part: the item content with an "Item URL" paragraph appended.
# NOTE(review): the content is encoded to UTF-8 here and encode("utf-8") is
# called again when the MIMEText is built - under Python 2 that looks like it
# can fail on non-ASCII content; verify.
390 htmlcontent = content.encode("utf-8")
391 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
395 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the same content run through HTML2Text, with the item URL
# appended.
396 textparser = HTML2Text()
397 textparser.feed(content.encode("utf-8"))
398 textcontent = textparser.gettext()
399 textcontent = "%s\n\nItem URL: %s" %( \
402 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
406 # start by working out the filename we should be writing to, we do
407 # this following the normal maildir style rules
# Name components: process id, hostname, ten characters chosen from letters
# and digits, and strftime('%s').
# NOTE(review): '%s' is a platform strftime extension - presumably epoch
# seconds; confirm on target platforms.
408 fname = str(os.getpid()) \
409 + "." + socket.gethostname() \
412 string.ascii_letters + string.digits \
413 ) for a in range(0,10) \
415 + datetime.datetime.now().strftime('%s')
# The message is written under tmp/ first; how fh is opened and closed
# (original lines 417 and 419) is not visible.
416 fn = os.path.join(maildir, "tmp", fname)
418 fh.write(msg.as_string())
420 # now move it into the new directory
421 newfn = os.path.join(maildir, "new", fname)
425 # now add to the database about the item
# Chain the previous Message-ID(s) in front of the new one (the guarding
# condition on original line 426 is not visible).
427 messageid = prevmessageid + " " + messageid
# Items whose guid differs from their link get a full record under the guid
# key, plus a record under the link key.
428 if item.has_key("guid") and item["guid"] != item["link"]:
429 data = urllib.urlencode(( \
430 ("message-id", messageid), \
431 ("created", data["created"][0]), \
432 ("contentmd5", md5sum) \
434 db[url + "|" + item["guid"]] = data
# An existing link record keeps its stored "created" and "contentmd5" values
# and only has its message-id replaced.
436 data = db[url + "|" + item["link"]]
437 data = cgi.parse_qs(data)
438 newdata = urllib.urlencode(( \
439 ("message-id", messageid), \
440 ("created", data["created"][0]), \
441 ("contentmd5", data["contentmd5"][0]) \
443 db[url + "|" + item["link"]] = newdata
445 db[url + "|" + item["link"]] = data
# Otherwise a full record is stored under the link key.
447 data = urllib.urlencode(( \
448 ("message-id", messageid), \
449 ("created", createddate), \
450 ("contentmd5", md5sum) \
452 db[url + "|" + item["link"]] = data
# Collect the validator headers from the last response and URL-encode them.
# NOTE(review): storing the result in feeddb and closing the databases are
# presumably done in lines not visible here - confirm.
456 for header in headers:
457 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
458 data.append((header[0], header[1]))
460 data = urllib.urlencode(data)
466 if __name__ == "__main__":
# Script entry point: locate the configuration file, resolve the state
# directory and the Maildir root, then make sure every feed section has a
# usable Maildir and run parse_and_deliver on it.
#
# NOTE(review): this listing keeps the original line numbers and skips some
# lines; try/except/else lines and the sys.exit calls are not visible.
467 # This only gets executed if we really called the program
468 # first off, parse the command line arguments
470 oparser = OptionParser()
472 "-c", "--conf", dest="conf",
473 help="location of config file"
476 "-s", "--statedir", dest="statedir",
477 help="location of directory to store state in"
480 (options, args) = oparser.parse_args()
482 # check for the configfile
# An explicitly given config file must exist; os.stat is used as the
# existence check.
486 if options.conf != None:
487 # does the file exist?
489 os.stat(options.conf)
490 configfile = options.conf
492 # should exit here as the specified file doesn't exist
494 "Config file %s does not exist. Exiting.\n" %(options.conf,))
497 # check through the default locations
# Default search order: $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
# NOTE(review): os.environ["HOME"] is indexed directly; confirm an unset HOME
# is handled by the surrounding lines not shown.
499 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
500 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
503 os.stat("/etc/rss2maildir.conf")
504 configfile = "/etc/rss2maildir.conf"
506 sys.stderr.write("No config file found. Exiting.\n")
509 # Right - if we've got this far, we've got a config file, now for the hard
512 scp = SafeConfigParser()
# Default Maildir root, used when [general] has no maildir_root option.
515 maildir_root = "RSSMaildir"
# State directory precedence: the -s/--statedir option first, then the
# [general] state_dir option; the remaining lines check state_dir itself.
518 if options.statedir != None:
519 state_dir = options.statedir
521 mode = os.stat(state_dir)[stat.ST_MODE]
522 if not stat.S_ISDIR(mode):
524 "State directory (%s) is not a directory\n" %(state_dir))
527 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing newline.
531 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
533 elif scp.has_option("general", "state_dir"):
534 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats state_dir, not the new_state_dir just read from the
# config, while the mkdir below uses new_state_dir - looks unintended; confirm.
536 mode = os.stat(state_dir)[stat.ST_MODE]
537 if not stat.S_ISDIR(mode):
539 "State directory (%s) is not a directory\n" %(state_dir))
544 os.mkdir(new_state_dir)
545 state_dir = new_state_dir
548 "Couldn't create state directory %s\n" %(new_state_dir))
552 mode = os.stat(state_dir)[stat.ST_MODE]
553 if not stat.S_ISDIR(mode):
555 "State directory %s is not a directory\n" %(state_dir))
562 "State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from [general] maildir_root when present, then checked
# to be a directory; the visible lines also create it with os.mkdir.
565 if scp.has_option("general", "maildir_root"):
566 maildir_root = scp.get("general", "maildir_root")
569 mode = os.stat(maildir_root)[stat.ST_MODE]
570 if not stat.S_ISDIR(mode):
572 "Maildir Root %s is not a directory\n" \
577 os.mkdir(maildir_root)
579 sys.stderr.write("Couldn't create Maildir Root %s\n" \
583 feeds = scp.sections()
# Every section other than "general" is treated as a feed.
585 feeds.remove("general")
589 for section in feeds:
590 # check if the directory exists
593 maildir = scp.get(section, "maildir")
# The maildir name is URL-quoted by round-tripping it through urlencode and
# keeping the value side, then placed under the Maildir root.
597 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
598 maildir = os.path.join(maildir_root, maildir)
601 exists = os.stat(maildir)
602 if stat.S_ISDIR(exists[stat.ST_MODE]):
603 # check if there's a new, cur and tmp directory
# Each of cur/, tmp/ and new/ is stat'ed, created with os.mkdir in the
# visible lines, and reported as a broken maildir when it is not a directory.
605 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
607 os.mkdir(os.path.join(maildir, "cur"))
608 if not stat.S_ISDIR(mode):
609 sys.stderr.write("Broken maildir: %s\n" %(maildir))
611 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
613 os.mkdir(os.path.join(maildir, "tmp"))
614 if not stat.S_ISDIR(mode):
615 sys.stderr.write("Broken maildir: %s\n" %(maildir))
617 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
618 if not stat.S_ISDIR(mode):
619 sys.stderr.write("Broken maildir: %s\n" %(maildir))
621 os.mkdir(os.path.join(maildir, "new"))
623 sys.stderr.write("Broken maildir: %s\n" %(maildir))
628 sys.stderr.write("Couldn't create root maildir %s\n" \
632 os.mkdir(os.path.join(maildir, "new"))
633 os.mkdir(os.path.join(maildir, "cur"))
634 os.mkdir(os.path.join(maildir, "tmp"))
637 "Couldn't create required maildir directories for %s\n" \
641 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed as the feed URL.
644 parse_and_deliver(maildir, section, state_dir)