4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTML2Text: HTMLParser subclass whose handlers build up a plain-text
# rendering of the HTML it is fed (headings, paragraphs, blockquotes, lists).
# NOTE(review): the embedded line numbers skip, so the `def` line that owns
# the assignments below (presumably __init__) and some attribute
# initialisations are not visible in this listing -- confirm in the full file.
59 class HTML2Text(HTMLParser):
# State flags: which kind of element the parser is currently inside.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
# Paragraph mode is the only flag that starts enabled.
65 self.inparagraph = True
66 self.inblockquote = False
# Unicode buffers holding text collected for the open paragraph / heading.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Base-class initialisation runs after the state above has been set up.
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# Opening-tag callback.  Tag names are compared lower-cased.  Heading tags
# set the matching heading flag and leave paragraph mode; br, blockquote, p,
# pre, ul and li update the state flags and append to self.text as shown.
# `attrs` is not used by any visible line.
# NOTE(review): the embedded line numbers skip, so some statements of this
# method are missing from the listing; comments cover visible lines only.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
# h3-h6 share a single "other heading" flag.
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
# NOTE(review): the body of the "a" branch is not visible in this listing.
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
# br: the pending paragraph is wrapped to 70 columns, appended to self.text
# and cleared.  NOTE(review): this reset assigns a byte string "" while the
# other visible resets of currentparagraph assign u'' -- confirm intent.
92 self.text = self.text \
94 textwrap.wrap(self.currentparagraph, 70)) \
96 self.currentparagraph = ""
# Inside a blockquote the blockquote buffer is flushed instead, wrapped to
# 68 columns.
97 elif self.inblockquote:
98 self.text = self.text \
102 for a in textwrap.wrap(self.blockquote, 68) \
105 self.blockquote = u''
107 self.text = self.text + u'\n'
108 elif tag.lower() == "blockquote":
109 self.inblockquote = True
110 self.text = self.text + u'\n'
# p: any pending paragraph text is wrapped to 70 columns and appended, the
# buffer is cleared and paragraph mode is entered.  The condition guarding
# the blank-line separator below is not visible.
111 elif tag.lower() == "p":
113 self.text = self.text + u'\n\n'
115 self.text = self.text \
116 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
117 self.currentparagraph = u''
118 self.inparagraph = True
# pre: leaves both paragraph and blockquote mode.
119 elif tag.lower() == "pre":
120 self.text = self.text + "\n"
122 self.inparagraph = False
123 self.inblockquote = False
124 elif tag.lower() == "ul":
127 self.text = self.text + "\n"
# li is only handled while self.inul is set; the visible lines wrap the
# current item text to 67 columns and strip each wrapped line.
128 elif tag.lower() == "li" and self.inul:
133 self.text = self.text \
135 + u'\n '.join([a.strip() for a in \
136 textwrap.wrap(self.item, 67)]) \
140 def handle_startendtag(self, tag, attrs):
# Callback for self-closing tags.  Only "br" is handled in the visible
# lines: the pending paragraph (wrapped to 70 columns) or blockquote text is
# appended to self.text and the corresponding buffer is cleared.
# NOTE(review): the embedded line numbers skip, so parts of these
# expressions and their guarding conditions are missing from the listing.
141 if tag.lower() == "br":
143 self.text = self.text \
146 for a in textwrap.wrap( \
147 self.currentparagraph, 70) \
151 self.currentparagraph = u''
# NOTE(review): this branch wraps self.blockquote.encode("utf-8"), a byte
# string, whereas handle_starttag wraps the unencoded self.blockquote for
# the same "br" case -- the two look inconsistent; confirm which is intended.
152 elif self.inblockquote:
153 self.text = self.text \
157 for a in textwrap.wrap( \
158 self.blockquote.encode("utf-8") \
163 self.blockquote = u''
165 self.text = self.text + "\n"
167 def handle_endtag(self, tag):
# Closing-tag callback.  Closing a heading emits the collected heading text
# plus a run of '=' (h1), '-' (h2) or '~' (h3-h6) and resets the heading
# buffer; closing p / blockquote / li flushes the matching buffer, wrapped,
# into self.text.
# NOTE(review): the embedded line numbers skip, so separators and some
# statements are missing from the listing; comments cover visible lines only.
# NOTE(review): the run length is len() of the UTF-8 *encoded* heading, i.e.
# a byte count, so non-ASCII headings get a longer run than their character
# count.  The encoded byte string is also concatenated with u'' literals;
# under Python 2 that presumably forces an implicit ASCII decode which fails
# on non-ASCII text -- confirm.
168 if tag.lower() == "h1":
169 self.inheadingone = False
170 self.text = self.text \
172 + self.headingtext.encode("utf-8") \
174 + u'=' * len(self.headingtext.encode("utf-8").strip())
175 self.headingtext = u''
176 elif tag.lower() == "h2":
177 self.inheadingtwo = False
178 self.text = self.text \
180 + self.headingtext.encode("utf-8") \
182 + u'-' * len(self.headingtext.encode("utf-8").strip())
183 self.headingtext = u''
184 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
185 self.inotherheading = False
186 self.text = self.text \
188 + self.headingtext.encode("utf-8") \
190 + u'~' * len(self.headingtext.encode("utf-8").strip())
191 self.headingtext = u''
# Closing p: flush the paragraph wrapped to 70 columns and leave paragraph
# mode.
192 elif tag.lower() == "p":
193 self.text = self.text \
194 + u'\n'.join(textwrap.wrap( \
195 self.currentparagraph, 70) \
197 self.inparagraph = False
198 self.currentparagraph = u''
# Closing blockquote: flush the quote wrapped to 68 columns and leave
# blockquote mode.
199 elif tag.lower() == "blockquote":
200 self.text = self.text \
204 for a in textwrap.wrap( \
205 self.blockquote, 68)] \
208 self.inblockquote = False
209 self.blockquote = u''
# NOTE(review): the body of the "pre" branch is not visible in this listing.
210 elif tag.lower() == "pre":
212 elif tag.lower() == "li":
215 self.text = self.text \
218 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
221 elif tag.lower() == "ul":
224 def handle_data(self, data):
# Text callback.  `data` is decoded from UTF-8 with unicode(), so it is
# expected to be a byte string (parse_and_deliver feeds UTF-8 encoded
# content).  The decoded text goes to the first matching buffer: heading,
# blockquote, paragraph or list item, otherwise straight to self.text.  The
# heading / blockquote / paragraph branches strip the text first; the list
# item branch does not.
# NOTE(review): the embedded line numbers skip, so the tails of the first
# three concatenations and the conditions selecting between the final two
# self.text appends (raw vs. stripped plus a trailing space) are not visible.
225 if self.inheadingone or self.inheadingtwo or self.inotherheading:
226 self.headingtext = self.headingtext \
227 + unicode(data, "utf-8").strip() \
229 elif self.inblockquote:
230 self.blockquote = self.blockquote \
231 + unicode(data, "utf-8").strip() \
233 elif self.inparagraph:
234 self.currentparagraph = self.currentparagraph \
235 + unicode(data, "utf-8").strip() \
237 elif self.inul and self.initem:
238 self.item = self.item + unicode(data, "utf-8")
240 self.text = self.text + unicode(data, "utf-8")
242 self.text = self.text + unicode(data, "utf-8").strip() + u' '
244 def handle_entityref(self, name):
# Entity-reference callback.  The entity is resolved to a replacement string
# and appended to the paragraph buffer, the blockquote buffer or self.text.
# NOTE(review): the embedded line numbers skip, so the conditions between
# the three `entity = ...` assignments and the first append are not visible.
# Known names are looked up (lower-cased) in `entities`, a mapping defined
# outside the visible lines.
246 if entities.has_key(name.lower()):
247 entity = entities[name.lower()]
# Numeric form: everything after the first character of `name` is parsed as
# a decimal code point.  NOTE(review): unichr() returns a unicode object and
# the appends below call unicode(entity, "utf-8"); under Python 2 decoding an
# object that is already unicode presumably raises TypeError -- confirm.
249 entity = unichr(int(name[1:]))
# Fallback: re-emit the reference literally.
251 entity = "&" + name + ";"
254 self.currentparagraph = self.currentparagraph \
255 + unicode(entity, "utf-8")
256 elif self.inblockquote:
257 self.blockquote = self.blockquote + unicode(entity, "utf-8")
259 self.text = self.text + unicode(entity, "utf-8")
# NOTE(review): fragment of a method whose `def` line is missing from this
# listing -- presumably gettext(), which parse_and_deliver calls.  The visible
# line appends the still-pending paragraph, wrapped to 70 columns, to `data`.
264 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
267 def parse_and_deliver(maildir, url, statedir):
# Fetch the feed at `url` with feedparser and, for each item, build a
# multipart/alternative message (HTML part plus a plain-text part produced by
# HTML2Text) whose file name is placed under <maildir>/tmp and then
# <maildir>/new.  Per-item state is kept in a dbm database at
# <statedir>/seen, keyed by "<url>|<item link>".
# NOTE(review): the embedded line numbers skip, so several statements
# (branch headers, the file open/close, the move into new/) are missing from
# this listing; comments below describe visible lines only.
268 fp = feedparser.parse(url)
# "c" opens the database read/write, creating it if necessary.
269 db = dbm.open(os.path.join(statedir, "seen"), "c")
270 for item in fp["items"]:
271 # have we seen it before?
272 # need to work out what the content is first...
# Full content is used when the item has it; the summary assignment below
# presumably sits in an else branch that is not visible.
274 if item.has_key("content"):
275 content = item["content"][0]["value"]
277 content = item["summary"]
# Digest of the UTF-8 content; compared with the stored digest below to tell
# whether an already-seen item has changed.
279 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# The stored record is a URL-encoded string (written at the end of the loop)
# and is parsed back with cgi.parse_qs, which yields lists of values.
283 if db.has_key(url + "|" + item["link"]):
284 data = db[url + "|" + item["link"]]
285 data = cgi.parse_qs(data)
286 if data.has_key("message-id"):
287 prevmessageid = data["message-id"][0]
# What happens when the digests match is not visible in this listing.
288 if data["contentmd5"][0] == md5sum:
292 author = item["author"]
296 # create a basic email message
297 msg = MIMEMultipart("alternative")
299 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
303 string.ascii_letters + string.digits \
304 ) for a in range(0,6) \
305 ]) + "@" + socket.gethostname() + ">"
306 msg.add_header("Message-ID", messageid)
# The feed URL is used for the unix-from line and the To header; the item
# author is used for the From header.
307 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
308 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
309 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
311 msg.add_header("References", prevmessageid)
# Date header built from the first six fields of updated_parsed with a fixed
# -0000 zone.  NOTE(review): %e and %T are platform strftime extensions and
# are not in Python's documented directive list -- confirm portability.
312 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
313 .strftime("%a, %e %b %Y %T -0000")
314 msg.add_header("Date", createddate)
315 msg.add_header("Subject", item["title"])
316 msg.set_default_type("text/plain")
# The HTML part carries the content as-is; the plain part is the HTML2Text
# rendering of the same UTF-8 bytes.
318 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
319 textparser = HTML2Text()
320 textparser.feed(content.encode("utf-8"))
321 textcontent = textparser.gettext()
322 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
326 # start by working out the filename we should be writing to, we do
327 # this following the normal maildir style rules
# File name components visible here: pid, hostname, ten random
# letters/digits and a timestamp.  NOTE(review): strftime('%s') is a
# platform extension as well -- confirm portability.
328 fname = str(os.getpid()) \
329 + "." + socket.gethostname() \
332 string.ascii_letters + string.digits \
333 ) for a in range(0,10) \
335 + datetime.datetime.now().strftime('%s')
336 fn = os.path.join(maildir, "tmp", fname)
338 fh.write(msg.as_string())
340 # now move it in to the new directory
341 newfn = os.path.join(maildir, "new", fname)
345 # now add to the database about the item
# Earlier Message-IDs are kept ahead of the new one, space separated; the
# condition guarding this line is not visible.
347 messageid = prevmessageid + " " + messageid
348 data = urllib.urlencode((
349 ("message-id", messageid), \
350 ("created", createddate), \
351 ("contentmd5", md5sum) \
353 db[url + "|" + item["link"]] = data
# Script entry: parse the -c/--conf and -s/--statedir options, then locate
# the configuration file.  An explicit --conf path is checked with os.stat;
# otherwise $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf are tried.
# NOTE(review): the embedded line numbers skip, so the add_option( calls,
# the try/except lines and the exit calls are not visible in this listing.
357 # first off, parse the command line arguments
359 oparser = OptionParser()
361 "-c", "--conf", dest="conf",
362 help="location of config file"
365 "-s", "--statedir", dest="statedir",
366 help="location of directory to store state in"
369 (options, args) = oparser.parse_args()
371 # check for the configfile
375 if options.conf != None:
376 # does the file exist?
378 os.stat(options.conf)
379 configfile = options.conf
381 # should exit here as the specified file doesn't exist
383 "Config file %s does not exist. Exiting.\n" %(options.conf,))
386 # check through the default locations
# NOTE(review): os.environ["HOME"] raises KeyError when HOME is unset;
# whether the surrounding (not visible) handler covers that is unclear.
388 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
389 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
392 os.stat("/etc/rss2maildir.conf")
393 configfile = "/etc/rss2maildir.conf"
395 sys.stderr.write("No config file found. Exiting.\n")
# Choose the state directory.  Visible precedence: the --statedir option,
# then the config's general/state_dir; the last group of checks below
# presumably handles a default value set on a line that is not visible.
# Each candidate is stat'ed and must be a directory.
# NOTE(review): the embedded line numbers skip, so the config read, the
# try/except lines, the mkdir for the option case and the exit calls are
# missing from this listing.
398 # Right - if we've got this far, we've got a config file, now for the hard
401 scp = SafeConfigParser()
404 maildir_root = "RSSMaildir"
407 if options.statedir != None:
408 state_dir = options.statedir
410 mode = os.stat(state_dir)[stat.ST_MODE]
411 if not stat.S_ISDIR(mode):
413 "State directory (%s) is not a directory\n" %(state_dir))
416 # try to make the directory
# NOTE(review): unlike the other messages here, this one has no trailing
# newline.
420 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
422 elif scp.has_option("general", "state_dir"):
423 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the stat and the message below use state_dir, not the
# new_state_dir just read from the config, while the mkdir further down uses
# new_state_dir -- this looks like it should test new_state_dir; confirm.
425 mode = os.stat(state_dir)[stat.ST_MODE]
426 if not stat.S_ISDIR(mode):
428 "State directory (%s) is not a directory\n" %(state_dir))
433 os.mkdir(new_state_dir)
434 state_dir = new_state_dir
437 "Couldn't create state directory %s\n" %(new_state_dir))
441 mode = os.stat(state_dir)[stat.ST_MODE]
442 if not stat.S_ISDIR(mode):
444 "State directory %s is not a directory\n" %(state_dir))
451 "State directory %s could not be created\n" %(state_dir))
# The config's general/maildir_root, when present, replaces the current
# maildir_root value.  The root is then stat'ed and must be a directory; the
# visible lines also show an os.mkdir attempt and its failure message.
# NOTE(review): the embedded line numbers skip, so the try/except lines and
# exit calls around these checks are missing from this listing.
454 if scp.has_option("general", "maildir_root"):
455 maildir_root = scp.get("general", "maildir_root")
458 mode = os.stat(maildir_root)[stat.ST_MODE]
459 if not stat.S_ISDIR(mode):
461 "Maildir Root %s is not a directory\n" \
466 os.mkdir(maildir_root)
468 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section other than "general" is treated as a feed.
471 feeds = scp.sections()
473 feeds.remove("general")
# For each feed section: work out its maildir path under maildir_root, make
# sure the maildir and its cur/tmp/new subdirectories exist, then fetch and
# deliver the feed.
# NOTE(review): the embedded line numbers skip, so the try/except/else lines
# and the fallback for sections without a "maildir" option are missing from
# this listing; comments describe visible lines only.
477 for section in feeds:
478 # check if the directory exists
481 maildir = scp.get(section, "maildir")
# urlencode is used here as an escaper: the (section, maildir) pair is
# encoded as "key=value" and the value half is kept as the directory name.
485 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
486 maildir = os.path.join(maildir_root, maildir)
489 exists = os.stat(maildir)
490 if stat.S_ISDIR(exists[stat.ST_MODE]):
491 # check if there's a new, cur and tmp directory
# NOTE(review): for cur and tmp the visible order is stat, mkdir, then the
# S_ISDIR test on `mode`, whereas for new the test comes before the mkdir.
# If the mkdir runs because the stat failed, `mode` would hold a stale value
# (or be unbound) when tested -- confirm against the full file.
493 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
495 os.mkdir(os.path.join(maildir, "cur"))
496 if not stat.S_ISDIR(mode):
497 sys.stderr.write("Broken maildir: %s\n" %(maildir))
499 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
501 os.mkdir(os.path.join(maildir, "tmp"))
502 if not stat.S_ISDIR(mode):
503 sys.stderr.write("Broken maildir: %s\n" %(maildir))
505 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
506 if not stat.S_ISDIR(mode):
507 sys.stderr.write("Broken maildir: %s\n" %(maildir))
509 os.mkdir(os.path.join(maildir, "new"))
511 sys.stderr.write("Broken maildir: %s\n" %(maildir))
516 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# The three standard maildir subdirectories are created here; the enclosing
# branch (presumably a freshly created maildir) is not visible.
519 os.mkdir(os.path.join(maildir, "new"))
520 os.mkdir(os.path.join(maildir, "cur"))
521 os.mkdir(os.path.join(maildir, "tmp"))
524 "Couldn't create required maildir directories for %s\n" \
528 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed as the feed URL.
531 parse_and_deliver(maildir, section, state_dir)