4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTMLParser subclass that accumulates a plain-text rendering of the HTML it is
# fed. The handler methods below buffer text per context (heading, paragraph,
# blockquote, list item) and append wrapped text to self.text.
# NOTE(review): the constructor's "def" line and some attribute initialisers
# (e.g. self.text, self.blockquote, self.inul, self.initem, self.item, which
# the handlers use) are not present in this listing.
59 class HTML2Text(HTMLParser):
# State flags recording which kind of element the parser is currently inside.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
# Paragraph mode starts switched on, so text seen before any tag is buffered
# as a paragraph.
65 self.inparagraph = True
66 self.inblockquote = False
# Buffers for the paragraph / heading text collected so far.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Initialise the base parser after the state above has been set up.
76 HTMLParser.__init__(self)
# Start-tag handler: updates the state flags for the element being opened and,
# for <br>, <p> and <li>, flushes buffered text into self.text.
# NOTE(review): several continuation and condition lines of this method are
# missing from this listing, so the comments describe only what is visible.
78 def handle_starttag(self, tag, attrs):
# Headings: switch on the matching heading flag and leave paragraph mode.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
# NOTE(review): the body of the <a> branch is not present in this listing.
88 elif tag.lower() == "a":
# <br>: emit the pending paragraph wrapped at 70 columns, or the pending
# blockquote wrapped at 68 columns, and reset the buffer that was emitted.
90 elif tag.lower() == "br":
92 self.text = self.text \
94 textwrap.wrap(self.currentparagraph, 70)) \
96 self.currentparagraph = ""
97 elif self.inblockquote:
98 self.text = self.text \
102 for a in textwrap.wrap(self.blockquote, 68) \
105 self.blockquote = u''
107 self.text = self.text + u'\n'
# <blockquote>: enter blockquote mode and start a new output line.
108 elif tag.lower() == "blockquote":
109 self.inblockquote = True
110 self.text = self.text + u'\n'
# <p>: append the paragraph buffer wrapped at 70 columns, then start a fresh
# paragraph.
111 elif tag.lower() == "p":
113 self.text = self.text + u'\n\n'
115 self.text = self.text \
116 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
117 self.currentparagraph = u''
118 self.inparagraph = True
# <pre>: start a new line and leave paragraph and blockquote mode.
119 elif tag.lower() == "pre":
120 self.text = self.text + "\n"
122 self.inparagraph = False
123 self.inblockquote = False
124 elif tag.lower() == "ul":
127 self.text = self.text + "\n"
# <li> is only acted on inside a <ul>; the buffered item text is wrapped at 67
# columns and appended to the output.
128 elif tag.lower() == "li" and self.inul:
133 self.text = self.text \
135 + u'\n '.join([a.strip() for a in textwrap.wrap(self.item, 67)]) \
# Handler for self-closing tags; only <br/> is examined in the visible code.
# It flushes the pending paragraph (wrapped at 70 columns) or the pending
# blockquote into self.text and resets the buffer that was flushed.
# NOTE(review): parts of this method are missing from this listing.
139 def handle_startendtag(self, tag, attrs):
140 if tag.lower() == "br":
142 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.currentparagraph, 70) \
150 self.currentparagraph = u''
# NOTE(review): this branch wraps self.blockquote.encode("utf-8"), whereas
# handle_starttag wraps self.blockquote unencoded. Mixing the encoded byte
# string with the unicode self.text looks unintended; confirm.
151 elif self.inblockquote:
152 self.text = self.text \
156 for a in textwrap.wrap( \
157 self.blockquote.encode("utf-8") \
162 self.blockquote = u''
164 self.text = self.text + "\n"
# End-tag handler: closes the current context and flushes its buffer into
# self.text.
# NOTE(review): several continuation lines of this method are missing from
# this listing, so the comments describe only what is visible.
166 def handle_endtag(self, tag):
# Headings are emitted as the heading text followed by an underline made of
# '=' (h1), '-' (h2) or '~' (h3-h6).
# NOTE(review): the underline length is len() of the UTF-8 *encoded* heading,
# i.e. a byte count, so it will be longer than the heading for non-ASCII text.
# Appending the encoded byte string to the unicode self.text also looks
# likely to fail for non-ASCII headings; confirm.
167 if tag.lower() == "h1":
168 self.inheadingone = False
169 self.text = self.text \
171 + self.headingtext.encode("utf-8") \
173 + u'=' * len(self.headingtext.encode("utf-8").strip())
174 self.headingtext = u''
175 elif tag.lower() == "h2":
176 self.inheadingtwo = False
177 self.text = self.text \
179 + self.headingtext.encode("utf-8") \
181 + u'-' * len(self.headingtext.encode("utf-8").strip())
182 self.headingtext = u''
183 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
184 self.inotherheading = False
185 self.text = self.text \
187 + self.headingtext.encode("utf-8") \
189 + u'~' * len(self.headingtext.encode("utf-8").strip())
190 self.headingtext = u''
# </p>: emit the paragraph buffer wrapped at 70 columns and leave paragraph
# mode.
191 elif tag.lower() == "p":
192 self.text = self.text \
193 + u'\n'.join(textwrap.wrap( \
194 self.currentparagraph, 70) \
196 self.inparagraph = False
197 self.currentparagraph = u''
# </blockquote>: emit the blockquote buffer wrapped at 68 columns and leave
# blockquote mode.
198 elif tag.lower() == "blockquote":
199 self.text = self.text \
202 [a.strip() for a in textwrap.wrap(self.blockquote, 68)] \
205 self.inblockquote = False
206 self.blockquote = u''
# NOTE(review): the bodies of the </pre> and </ul> branches are not present in
# this listing.
207 elif tag.lower() == "pre":
# </li>: emit the list-item buffer wrapped at 67 columns.
209 elif tag.lower() == "li":
212 self.text = self.text \
215 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
218 elif tag.lower() == "ul":
def handle_data(self, data):
    """Append a run of character data to the buffer for the current context.

    The data goes to exactly one buffer, checked in this order: heading
    text, blockquote, paragraph, list item, then the output text itself.
    Heading, blockquote, paragraph and plain text are stripped and followed
    by a single space so that adjacent runs stay separated; list-item and
    preformatted text are appended unchanged.

    Arguments:
    data -- the text between tags, either a UTF-8 encoded byte string or
            already-decoded text
    """
    # The previous code called unicode(data, "utf-8") in every branch, which
    # raises TypeError when the parser has been fed decoded text.  Decode
    # once, and only when the data really is a byte string.
    if isinstance(data, bytes):
        data = data.decode("utf-8")

    if self.inheadingone or self.inheadingtwo or self.inotherheading:
        self.headingtext = self.headingtext + data.strip() + u' '
    elif self.inblockquote:
        self.blockquote = self.blockquote + data.strip() + u' '
    elif self.inparagraph:
        self.currentparagraph = self.currentparagraph + data.strip() + u' '
    elif self.inul and self.initem:
        self.item = self.item + data
    # NOTE(review): the condition of this branch was not present in the
    # reviewed listing; "inpre" is inferred from the fact that the text is
    # appended without stripping -- confirm against the full file.
    elif self.inpre:
        self.text = self.text + data
    else:
        self.text = self.text + data.strip() + u' '
def handle_entityref(self, name):
    """Append the text for the entity reference ``&name;`` to the buffer for
    the current context.

    Named entities are looked up (case-insensitively) in the module-level
    ``entities`` table.  A name starting with "#" is treated as a decimal
    or hexadecimal character number.  Anything unknown or malformed is kept
    literally as ``&name;``.

    Arguments:
    name -- the entity name without the leading "&" and trailing ";"
    """
    key = name.lower()
    if key in entities:
        entity = entities[key]
    elif name.startswith("#"):
        digits = name[1:]
        try:
            if digits[:1] in ("x", "X"):
                entity = unichr(int(digits[1:], 16))
            else:
                entity = unichr(int(digits))
        except (ValueError, OverflowError):
            # Not a usable character number: keep the reference as written
            # rather than aborting the whole conversion.
            entity = "&" + name + ";"
    else:
        entity = "&" + name + ";"

    # Use the same routing order as handle_data.  Previously an entity seen
    # inside a heading or list item was appended straight to self.text, so
    # it appeared before the text it belonged to.
    if self.inheadingone or self.inheadingtwo or self.inotherheading:
        self.headingtext = self.headingtext + entity
    elif self.inblockquote:
        self.blockquote = self.blockquote + entity
    elif self.inparagraph:
        self.currentparagraph = self.currentparagraph + entity
    elif self.inul and self.initem:
        self.item = self.item + entity
    else:
        self.text = self.text + entity
# NOTE(review): fragment -- the header of the enclosing method is not in view;
# it is presumably the gettext() that parse_and_deliver calls. This line adds
# the still-buffered paragraph, wrapped at 70 columns, to "data".
260 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at ``url`` and write one message per new or changed item.

    The content of every item is hashed and compared with the hash stored in
    the ``seen`` database under ``statedir``; items whose content has not
    changed are skipped.  Every other item is turned into a
    ``multipart/alternative`` message that is written into ``maildir/tmp``
    and then moved into ``maildir/new``, after which the database entry for
    the item is updated.

    Arguments:
    maildir  -- path of a maildir that already has ``tmp`` and ``new``
    url      -- feed URL; also used in the To header and as the prefix of
                the state key
    statedir -- directory holding the ``seen`` dbm database

    Errors from feedparser, dbm or the filesystem propagate to the caller;
    the database is closed in every case.
    """
    # Function-scope imports keep this block independent of the file's import
    # list, which is only partly visible from here.
    import calendar
    import hashlib
    import random
    import time
    from email.utils import formatdate

    def _random_token(length):
        # Random alphanumeric string that keeps ids and file names unique.
        alphabet = string.ascii_letters + string.digits
        return "".join([random.choice(alphabet) for _ in range(length)])

    hostname = socket.gethostname()
    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...
            try:
                content = item["content"][0]["value"]
            except (KeyError, IndexError):
                content = item["summary"]

            md5sum = hashlib.md5(content.encode("utf-8")).hexdigest()

            key = url + "|" + item["link"]
            try:
                seen = cgi.parse_qs(db[key])
            except KeyError:
                seen = {}
            if seen.get("contentmd5", [None])[0] == md5sum:
                continue

            # NOTE(review): the fallback for items without an author was not
            # visible in the reviewed listing; the feed URL is assumed.
            author = item.get("author", url)

            # create a basic email message
            msg = MIMEMultipart("alternative")
            # NOTE(review): the leading "<" and the "." separator are
            # reconstructed -- only the timestamp, the random part and the
            # "@host>" tail were visible.
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." + _random_token(6) \
                + "@" + hostname + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))

            # formatdate() yields an RFC 2822 date in UTC ("-0000") without
            # relying on the locale or on the glibc-only %e / %T strftime
            # codes.  Items that carry no parsed date get the current time
            # instead of aborting the run.
            updated = item.get("updated_parsed")
            if updated:
                timestamp = calendar.timegm(updated)
            else:
                timestamp = time.time()
            createddate = formatdate(timestamp)
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf-8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
            # NOTE(review): the attach calls were not visible in the reviewed
            # listing; plain text first, HTML last is the usual order for
            # multipart/alternative -- confirm.
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to, we
            # do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + hostname \
                + "." + _random_token(10) \
                + "." + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                # never leak the handle, even when the write fails
                fh.close()

            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.rename(fn, newfn)

            # now add to the database about the item
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[key] = data
    finally:
        db.close()
# Script body: parse the command line, locate and read the configuration,
# make sure the state directory and the maildir root exist, then deliver
# every configured feed into its own maildir.
# NOTE(review): many "try:", "except:", "else:" and exit lines of this section
# are missing from this listing, so the comments describe only what is
# visible.
345 # first off, parse the command line arguments
347 oparser = OptionParser()
349 "-c", "--conf", dest="conf",
350 help="location of config file"
353 "-s", "--statedir", dest="statedir",
354 help="location of directory to store state in"
357 (options, args) = oparser.parse_args()
359 # check for the configfile
# An explicitly given config file must exist; os.stat() is used as the
# existence test.
363 if options.conf != None:
364 # does the file exist?
366 os.stat(options.conf)
367 configfile = options.conf
369 # should exit here as the specified file doesn't exist
370 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
# Otherwise fall back to $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
373 # check through the default locations
375 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
376 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
379 os.stat("/etc/rss2maildir.conf")
380 configfile = "/etc/rss2maildir.conf"
382 sys.stderr.write("No config file found. Exiting.\n")
385 # Right - if we've got this far, we've got a config file, now for the hard
388 scp = SafeConfigParser()
# Default maildir root, used unless the [general] section overrides it below.
391 maildir_root = "RSSMaildir"
# State directory: the -s/--statedir option is consulted first, then the
# "state_dir" option of [general]. Whichever is used must be a directory.
394 if options.statedir != None:
395 state_dir = options.statedir
397 mode = os.stat(state_dir)[stat.ST_MODE]
398 if not stat.S_ISDIR(mode):
399 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
402 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing newline.
406 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
408 elif scp.has_option("general", "state_dir"):
409 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the next lines stat and report state_dir although the value
# just read from the config is in new_state_dir -- this looks like the wrong
# variable; confirm against the full file.
411 mode = os.stat(state_dir)[stat.ST_MODE]
412 if not stat.S_ISDIR(mode):
413 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
418 os.mkdir(new_state_dir)
419 state_dir = new_state_dir
421 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
425 mode = os.stat(state_dir)[stat.ST_MODE]
426 if not stat.S_ISDIR(mode):
427 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
433 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
# Maildir root: optional override from [general]; it must be a directory and
# is created with os.mkdir() in the visible fallback.
436 if scp.has_option("general", "maildir_root"):
437 maildir_root = scp.get("general", "maildir_root")
440 mode = os.stat(maildir_root)[stat.ST_MODE]
441 if not stat.S_ISDIR(mode):
442 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
446 os.mkdir(maildir_root)
448 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section except "general" describes one feed.
451 feeds = scp.sections()
453 feeds.remove("general")
457 for section in feeds:
458 # check if the directory exists
461 maildir = scp.get(section, "maildir")
# urlencode() is used to escape the maildir name before it is joined onto the
# maildir root.
465 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
466 maildir = os.path.join(maildir_root, maildir)
469 exists = os.stat(maildir)
470 if stat.S_ISDIR(exists[stat.ST_MODE]):
471 # check if there's a new, cur and tmp directory
473 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
475 os.mkdir(os.path.join(maildir, "cur"))
476 if not stat.S_ISDIR(mode):
477 sys.stderr.write("Broken maildir: %s\n" %(maildir))
479 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
481 os.mkdir(os.path.join(maildir, "tmp"))
482 if not stat.S_ISDIR(mode):
483 sys.stderr.write("Broken maildir: %s\n" %(maildir))
485 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
486 if not stat.S_ISDIR(mode):
487 sys.stderr.write("Broken maildir: %s\n" %(maildir))
489 os.mkdir(os.path.join(maildir, "new"))
491 sys.stderr.write("Broken maildir: %s\n" %(maildir))
496 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# A freshly created maildir gets its three standard subdirectories.
499 os.mkdir(os.path.join(maildir, "new"))
500 os.mkdir(os.path.join(maildir, "cur"))
501 os.mkdir(os.path.join(maildir, "tmp"))
503 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
506 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed to parse_and_deliver() as the feed URL.
509 parse_and_deliver(maildir, section, state_dir)