4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
59 class HTML2Text(HTMLParser):
# HTMLParser subclass (Python 2) whose tag/data handlers append plain text to
# self.text, buffering heading, paragraph, blockquote and list-item text in
# self.headingtext / self.currentparagraph / self.blockquote / self.item
# until the corresponding tag is handled.
#
# NOTE(review): this listing is a lossy extract -- each line carries a stray
# leading number, indentation is gone and a number of source lines are
# absent.  Comments here describe only what the visible lines show.
#
# The assignments below set the parser's initial state flags and buffers and
# then call the base-class initialiser; presumably they form the body of
# __init__, whose def line is not visible in this extract.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
65 self.inparagraph = True
66 self.inblockquote = False
69 self.currentparagraph = u''
70 self.headingtext = u''
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# Dispatch on the lower-cased opening tag name:
#   h1 / h2 / h3-h6  set the matching heading flag and clear inparagraph
#   blockquote       sets inblockquote and appends a newline to self.text
#   p                appends the buffered paragraph wrapped at 70 columns,
#                    empties the buffer and sets inparagraph
#   pre              appends a newline, clears inparagraph and inblockquote
#   ul               appends a newline
#   li (inside ul)   appends the buffered item wrapped at 67 columns
# The bodies of the "a" and "br" branches, and the conditions guarding the
# statements in the "p" branch, are not visible in this extract.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
92 elif tag.lower() == "blockquote":
93 self.inblockquote = True
94 self.text = self.text + u'\n'
95 elif tag.lower() == "p":
97 self.text = self.text + u'\n\n'
99 self.text = self.text \
100 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
101 self.currentparagraph = u''
102 self.inparagraph = True
103 elif tag.lower() == "pre":
104 self.text = self.text + "\n"
106 self.inparagraph = False
107 self.inblockquote = False
108 elif tag.lower() == "ul":
111 self.text = self.text + "\n"
112 elif tag.lower() == "li" and self.inul:
117 self.text = self.text \
119 + u'\n '.join([a.strip() for a in \
120 textwrap.wrap(self.item, 67)]) \
124 def handle_startendtag(self, tag, attrs):
# Handles self-closing tags; only "br" is tested in the visible lines.
# A <br/> flushes the buffered paragraph (wrapped at 70 columns) or, when
# self.inblockquote is set, the buffered blockquote, and then resets that
# buffer.  The final statement appends a bare newline; the condition
# selecting it is not visible (presumably an else branch).
# NOTE(review): the blockquote branch wraps self.blockquote.encode("utf-8"),
# i.e. a byte string, whereas the other buffers are wrapped as unicode --
# confirm this is intended.
125 if tag.lower() == "br":
130 self.text = self.text \
133 for a in textwrap.wrap( \
134 self.currentparagraph, 70) \
138 self.currentparagraph = u''
139 elif self.inblockquote:
140 self.text = self.text \
144 for a in textwrap.wrap( \
145 self.blockquote.encode("utf-8") \
150 self.blockquote = u''
152 self.text = self.text + "\n"
154 def handle_endtag(self, tag):
# Dispatch on the lower-cased closing tag name:
#   h1 / h2 / h3-h6  clear the heading flag, append the heading text and an
#                    underline of '=' / '-' / '~' characters, reset the buffer
#   p                append the paragraph wrapped at 70 columns, clear
#                    inparagraph and the buffer
#   blockquote       append the quote wrapped at 68 columns, clear
#                    inblockquote and the buffer
#   li               append the item wrapped at 67 columns
# The bodies of the "pre" and "ul" branches are not visible in this extract.
# NOTE(review): the heading branches append headingtext.encode("utf-8") (a
# byte string) to self.text, which holds unicode, and size the underline with
# len() of the encoded bytes.  In Python 2 the concatenation implies an ASCII
# decode, so non-ASCII headings look likely to raise UnicodeDecodeError, and
# the underline would be longer than the visible heading -- TODO confirm.
155 if tag.lower() == "h1":
156 self.inheadingone = False
157 self.text = self.text \
159 + self.headingtext.encode("utf-8") \
161 + u'=' * len(self.headingtext.encode("utf-8").strip())
162 self.headingtext = u''
163 elif tag.lower() == "h2":
164 self.inheadingtwo = False
165 self.text = self.text \
167 + self.headingtext.encode("utf-8") \
169 + u'-' * len(self.headingtext.encode("utf-8").strip())
170 self.headingtext = u''
171 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
172 self.inotherheading = False
173 self.text = self.text \
175 + self.headingtext.encode("utf-8") \
177 + u'~' * len(self.headingtext.encode("utf-8").strip())
178 self.headingtext = u''
179 elif tag.lower() == "p":
180 self.text = self.text \
181 + u'\n'.join(textwrap.wrap( \
182 self.currentparagraph, 70) \
184 self.inparagraph = False
185 self.currentparagraph = u''
186 elif tag.lower() == "blockquote":
187 self.text = self.text \
191 for a in textwrap.wrap( \
192 self.blockquote, 68)] \
195 self.inblockquote = False
196 self.blockquote = u''
197 elif tag.lower() == "pre":
199 elif tag.lower() == "li":
202 self.text = self.text \
205 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
208 elif tag.lower() == "ul":
211 def handle_data(self, data):
# Decodes the character data as UTF-8 and appends it to whichever buffer the
# current state selects: heading text, blockquote, paragraph, list item, or
# self.text directly.  The heading/blockquote/paragraph branches strip the
# decoded text; each ends in a line continuation whose final operand is not
# visible here.  The last two statements append to self.text either verbatim
# or stripped plus a trailing space; the conditions choosing between them are
# not visible in this extract.
212 if self.inheadingone or self.inheadingtwo or self.inotherheading:
213 self.headingtext = self.headingtext \
214 + unicode(data, "utf-8").strip() \
216 elif self.inblockquote:
217 self.blockquote = self.blockquote \
218 + unicode(data, "utf-8").strip() \
220 elif self.inparagraph:
221 self.currentparagraph = self.currentparagraph \
222 + unicode(data, "utf-8").strip() \
224 elif self.inul and self.initem:
225 self.item = self.item + unicode(data, "utf-8")
227 self.text = self.text + unicode(data, "utf-8")
229 self.text = self.text + unicode(data, "utf-8").strip() + u' '
231 def handle_entityref(self, name):
# Resolves an entity reference and appends it to the paragraph buffer, the
# blockquote buffer or self.text.  Three candidate values are visible: a
# lookup of the lower-cased name in `entities` (defined outside this
# extract), unichr(int(name[1:])), and the literal "&name;" text; the
# conditions selecting the latter two are not visible.
# NOTE(review): every append calls unicode(entity, "utf-8").  If entity is
# the unichr() result it is already unicode, and Python 2 rejects decoding a
# unicode object with TypeError -- TODO confirm how that path is guarded.
233 if entities.has_key(name.lower()):
234 entity = entities[name.lower()]
236 entity = unichr(int(name[1:]))
238 entity = "&" + name + ";"
241 self.currentparagraph = self.currentparagraph \
242 + unicode(entity, "utf-8")
243 elif self.inblockquote:
244 self.blockquote = self.blockquote + unicode(entity, "utf-8")
246 self.text = self.text + unicode(entity, "utf-8")
# The next line appends the still-buffered paragraph, wrapped at 70 columns,
# to a local `data`; presumably it belongs to the gettext() method that
# parse_and_deliver calls, whose def line is not visible in this extract.
251 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
254 def parse_and_deliver(maildir, url, statedir):
# Fetch the feed at `url` with feedparser and, for each item, build a
# multipart/alternative message (HTML part plus an HTML2Text plain-text
# part) that is written under `maildir`.  A dbm database named "seen" inside
# `statedir` (opened with flag "c") records what has been handled per item.
#
# maildir  -- directory containing the tmp/ and new/ subdirectories used below
# url      -- feed URL; also used in the To header, unixfrom and state keys
# statedir -- directory holding the "seen" database
#
# NOTE(review): this listing is a lossy extract; the try/except/else lines,
# several assignments and the file open/rename calls are not visible.
255 fp = feedparser.parse(url)
256 db = dbm.open(os.path.join(statedir, "seen"), "c")
257 for item in fp["items"]:
258 # have we seen it before?
259 # need to work out what the content is first...
# Use the first "content" entry when the item has one; the summary
# assignment below is presumably the else branch (its guard is not visible).
261 if item.has_key("content"):
262 content = item["content"][0]["value"]
264 content = item["summary"]
# Hex MD5 digest of the UTF-8 encoded content, compared with the stored
# "contentmd5" value further down.
266 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# State records are keyed by "<feed url>|<item link>" and stored as a
# URL-encoded query string, hence cgi.parse_qs.  The action taken when the
# stored digest equals md5sum is not visible (presumably the item is skipped).
270 if db.has_key(url + "|" + item["link"]):
271 data = db[url + "|" + item["link"]]
272 data = cgi.parse_qs(data)
273 if data.has_key("message-id"):
274 prevmessageid = data["message-id"][0]
275 if data["contentmd5"][0] == md5sum:
279 author = item["author"]
283 # create a basic email message
# The Message-ID is assembled from a minute-resolution timestamp, six
# characters taken from ASCII letters and digits (the selecting call is not
# visible) and the local hostname.
284 msg = MIMEMultipart("alternative")
286 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
290 string.ascii_letters + string.digits \
291 ) for a in range(0,6) \
292 ]) + "@" + socket.gethostname() + ">"
293 msg.add_header("Message-ID", messageid)
294 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
295 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
296 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
298 msg.add_header("References", prevmessageid)
# The Date header comes from the first six fields of the item's
# "updated_parsed" value and is labelled with a fixed "-0000" offset.
# NOTE(review): %e and %T (and %s used for the filename below) are not among
# the strftime directives Python documents as portable; they depend on the
# platform C library -- confirm the supported platforms.
299 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
300 .strftime("%a, %e %b %Y %T -0000")
301 msg.add_header("Date", createddate)
302 msg.add_header("Subject", item["title"])
303 msg.set_default_type("text/plain")
# Both parts are built from the UTF-8 encoded content: the HTML part as-is,
# the plain-text part from HTML2Text's rendering of the same markup.
305 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
306 textparser = HTML2Text()
307 textparser.feed(content.encode("utf-8"))
308 textcontent = textparser.gettext()
309 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
313 # start by working out the filename we should be writting to, we do
314 # this following the normal maildir style rules
# Filename components visible here: process id, hostname, ten characters from
# ASCII letters and digits, and a strftime('%s') timestamp.
315 fname = str(os.getpid()) \
316 + "." + socket.gethostname() \
319 string.ascii_letters + string.digits \
320 ) for a in range(0,10) \
322 + datetime.datetime.now().strftime('%s')
323 fn = os.path.join(maildir, "tmp", fname)
325 fh.write(msg.as_string())
327 # now move it in to the new directory
328 newfn = os.path.join(maildir, "new", fname)
332 # now add to the database about the item
# Any previous message id is prepended so the stored "message-id" value
# accumulates ids separated by spaces (the guard on this line is not
# visible); the record is then saved as a URL-encoded string under the same
# "<feed url>|<item link>" key.
334 messageid = prevmessageid + " " + messageid
335 data = urllib.urlencode((
336 ("message-id", messageid), \
337 ("created", createddate), \
338 ("contentmd5", md5sum) \
340 db[url + "|" + item["link"]] = data
344 # first off, parse the command line arguments
# Script driver: parse options, locate a config file, resolve the state
# directory and maildir root, make sure each feed's maildir has cur/tmp/new,
# then call parse_and_deliver for every feed section.
# NOTE(review): this listing is a lossy extract -- the enclosing def (if
# any), the try/except/else lines, the add_option/stderr.write call openers
# and any exit calls are not visible; comments describe visible lines only.
346 oparser = OptionParser()
348 "-c", "--conf", dest="conf",
349 help="location of config file"
352 "-s", "--statedir", dest="statedir",
353 help="location of directory to store state in"
356 (options, args) = oparser.parse_args()
358 # check for the configfile
# An explicit --conf path is probed with os.stat; the error text below is
# used when that file is missing.
362 if options.conf != None:
363 # does the file exist?
365 os.stat(options.conf)
366 configfile = options.conf
368 # should exit here as the specified file doesn't exist
370 "Config file %s does not exist. Exiting.\n" %(options.conf,))
373 # check through the default locations
# Defaults are probed in order: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.
375 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
376 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
379 os.stat("/etc/rss2maildir.conf")
380 configfile = "/etc/rss2maildir.conf"
382 sys.stderr.write("No config file found. Exiting.\n")
385 # Right - if we've got this far, we've got a config file, now for the hard
388 scp = SafeConfigParser()
# Default maildir root; may be replaced by [general] maildir_root below.
391 maildir_root = "RSSMaildir"
# State directory precedence visible here: --statedir option, then
# [general] state_dir from the config, then a default whose assignment is
# not visible in this extract.
394 if options.statedir != None:
395 state_dir = options.statedir
397 mode = os.stat(state_dir)[stat.ST_MODE]
398 if not stat.S_ISDIR(mode):
400 "State directory (%s) is not a directory\n" %(state_dir))
403 # try to make the directory
407 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
# NOTE(review): the message above has no trailing "\n", unlike the other
# error messages in this script.
409 elif scp.has_option("general", "state_dir"):
# NOTE(review): this branch reads new_state_dir from the config but the
# os.stat call and the error message use state_dir, and state_dir is only
# reassigned after os.mkdir(new_state_dir).  It looks as though an already
# existing configured directory would never be adopted -- TODO confirm
# against the full source.
410 new_state_dir = scp.get("general", "state_dir")
412 mode = os.stat(state_dir)[stat.ST_MODE]
413 if not stat.S_ISDIR(mode):
415 "State directory (%s) is not a directory\n" %(state_dir))
420 os.mkdir(new_state_dir)
421 state_dir = new_state_dir
424 "Couldn't create state directory %s\n" %(new_state_dir))
428 mode = os.stat(state_dir)[stat.ST_MODE]
429 if not stat.S_ISDIR(mode):
431 "State directory %s is not a directory\n" %(state_dir))
438 "State directory %s could not be created\n" %(state_dir))
# Resolve the maildir root and make sure it is a directory, creating it when
# the stat fails (the try/except lines are not visible).
441 if scp.has_option("general", "maildir_root"):
442 maildir_root = scp.get("general", "maildir_root")
445 mode = os.stat(maildir_root)[stat.ST_MODE]
446 if not stat.S_ISDIR(mode):
448 "Maildir Root %s is not a directory\n" \
453 os.mkdir(maildir_root)
455 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section other than "general" is treated as a feed.
458 feeds = scp.sections()
460 feeds.remove("general")
464 for section in feeds:
465 # check if the directory exists
468 maildir = scp.get(section, "maildir")
# URL-quote the maildir name by urlencoding a (section, maildir) pair and
# keeping the text after the first "=", then place it under maildir_root.
# NOTE(review): split("=")[1] relies on urlencode escaping any "=" inside
# the section name and the value -- confirm.
472 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
473 maildir = os.path.join(maildir_root, maildir)
# For an existing maildir, each of cur/, tmp/ and new/ is stat'ed, created
# with os.mkdir where a mkdir line is visible, and reported as a "Broken
# maildir" when it is not a directory.
476 exists = os.stat(maildir)
477 if stat.S_ISDIR(exists[stat.ST_MODE]):
478 # check if there's a new, cur and tmp directory
480 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
482 os.mkdir(os.path.join(maildir, "cur"))
483 if not stat.S_ISDIR(mode):
484 sys.stderr.write("Broken maildir: %s\n" %(maildir))
486 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
488 os.mkdir(os.path.join(maildir, "tmp"))
489 if not stat.S_ISDIR(mode):
490 sys.stderr.write("Broken maildir: %s\n" %(maildir))
492 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
493 if not stat.S_ISDIR(mode):
494 sys.stderr.write("Broken maildir: %s\n" %(maildir))
496 os.mkdir(os.path.join(maildir, "new"))
498 sys.stderr.write("Broken maildir: %s\n" %(maildir))
503 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# Creation of the three subdirectories for a maildir that did not exist
# (the surrounding try/except is not visible).
506 os.mkdir(os.path.join(maildir, "new"))
507 os.mkdir(os.path.join(maildir, "cur"))
508 os.mkdir(os.path.join(maildir, "tmp"))
511 "Couldn't create required maildir directories for %s\n" \
515 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed as the feed URL.
518 parse_and_deliver(maildir, section, state_dir)