4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that builds up a plain-text rendering of
# the HTML fed to it. Boolean flags record which element the parser is
# currently inside, and text is wrapped with textwrap before being appended
# to self.text.
# NOTE(review): the embedded line numbers in this listing skip, so some
# statements of every method are not shown here; the comments below describe
# only the lines that are visible.
59 class HTML2Text(HTMLParser):
# Parser state: one flag per kind of open element (inparagraph is the only
# one that starts out True), plus the buffers holding the paragraph and
# heading text collected so far. The base class is initialised last.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
65 self.inparagraph = True
66 self.inblockquote = False
69 self.currentparagraph = u''
70 self.headingtext = u''
76 HTMLParser.__init__(self)
78 def handle_starttag(self, tag, attrs):
# Opening tag, matched case-insensitively via tag.lower().
#  - h1 / h2 / h3-h6 set the matching heading flag and clear inparagraph.
#  - br appends the pending paragraph (wrapped at 70) or blockquote
#    (wrapped at 68) text to self.text, resets that buffer and adds a newline.
#  - blockquote sets inblockquote and adds a newline.
#  - p appends to self.text, resets currentparagraph and sets inparagraph.
#  - pre adds a newline and clears inparagraph and inblockquote.
#  - ul adds a newline; li is only handled while self.inul is set and
#    appends self.item wrapped at 67.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
88 elif tag.lower() == "a":
90 elif tag.lower() == "br":
92 self.text = self.text \
94 textwrap.wrap(self.currentparagraph, 70)) \
96 self.currentparagraph = ""
97 elif self.inblockquote:
98 self.text = self.text \
102 for a in textwrap.wrap(self.blockquote, 68) \
105 self.blockquote = u''
107 self.text = self.text + u'\n'
108 elif tag.lower() == "blockquote":
109 self.inblockquote = True
110 self.text = self.text + u'\n'
111 elif tag.lower() == "p":
113 self.text = self.text + u'\n\n'
115 self.text = self.text \
116 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
117 self.currentparagraph = u''
118 self.inparagraph = True
119 elif tag.lower() == "pre":
120 self.text = self.text + "\n"
122 self.inparagraph = False
123 self.inblockquote = False
124 elif tag.lower() == "ul":
127 self.text = self.text + "\n"
128 elif tag.lower() == "li" and self.inul:
133 self.text = self.text \
135 + u'\n '.join([a.strip() for a in \
136 textwrap.wrap(self.item, 67)]) \
140 def handle_startendtag(self, tag, attrs):
# Self-closing tag: "br" is the only tag tested in the visible lines. It
# appends the pending paragraph or blockquote text to self.text, resets that
# buffer and adds a newline, like the br branch of handle_starttag.
141 if tag.lower() == "br":
143 self.text = self.text \
146 for a in textwrap.wrap( \
147 self.currentparagraph, 70) \
151 self.currentparagraph = u''
# NOTE(review): this branch wraps self.blockquote.encode("utf-8"), whereas
# the br branch of handle_starttag wraps self.blockquote unencoded - confirm
# which of the two is intended.
152 elif self.inblockquote:
153 self.text = self.text \
157 for a in textwrap.wrap( \
158 self.blockquote.encode("utf-8") \
163 self.blockquote = u''
165 self.text = self.text + "\n"
167 def handle_endtag(self, tag):
# Closing tag: clear the flag of the element being left and append its
# collected text to self.text.
#  - Headings are followed by an underline of '=' (h1), '-' (h2) or
#    '~' (h3-h6), repeated len(headingtext.encode("utf-8").strip()) times.
#  - p appends currentparagraph wrapped at 70 and clears inparagraph.
#  - blockquote appends the blockquote text wrapped at 68 and clears
#    inblockquote.
#  - li appends self.item wrapped at 67.
# NOTE(review): the underline length is the length of the encoded byte
# string, so presumably it exceeds the heading width for non-ASCII text.
# The encoded byte string is also concatenated with u'' literals, which
# relies on Python 2's implicit ASCII decoding - verify with a non-ASCII
# heading.
168 if tag.lower() == "h1":
169 self.inheadingone = False
170 self.text = self.text \
172 + self.headingtext.encode("utf-8") \
174 + u'=' * len(self.headingtext.encode("utf-8").strip())
175 self.headingtext = u''
176 elif tag.lower() == "h2":
177 self.inheadingtwo = False
178 self.text = self.text \
180 + self.headingtext.encode("utf-8") \
182 + u'-' * len(self.headingtext.encode("utf-8").strip())
183 self.headingtext = u''
184 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
185 self.inotherheading = False
186 self.text = self.text \
188 + self.headingtext.encode("utf-8") \
190 + u'~' * len(self.headingtext.encode("utf-8").strip())
191 self.headingtext = u''
192 elif tag.lower() == "p":
193 self.text = self.text \
194 + u'\n'.join(textwrap.wrap( \
195 self.currentparagraph, 70) \
197 self.inparagraph = False
198 self.currentparagraph = u''
199 elif tag.lower() == "blockquote":
200 self.text = self.text \
204 for a in textwrap.wrap( \
205 self.blockquote, 68)] \
208 self.inblockquote = False
209 self.blockquote = u''
210 elif tag.lower() == "pre":
212 elif tag.lower() == "li":
215 self.text = self.text \
218 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
221 elif tag.lower() == "ul":
224 def handle_data(self, data):
# Text content: decode data as UTF-8 and append it to the buffer matching
# the current state - heading text, blockquote, current paragraph, list item
# (only while both inul and initem are set) or self.text itself. The
# heading, blockquote and paragraph branches strip the decoded text first;
# the list-item branch appends it unstripped.
225 if self.inheadingone or self.inheadingtwo or self.inotherheading:
226 self.headingtext = self.headingtext \
227 + unicode(data, "utf-8").strip() \
229 elif self.inblockquote:
230 self.blockquote = self.blockquote \
231 + unicode(data, "utf-8").strip() \
233 elif self.inparagraph:
234 self.currentparagraph = self.currentparagraph \
235 + unicode(data, "utf-8").strip() \
237 elif self.inul and self.initem:
238 self.item = self.item + unicode(data, "utf-8")
240 self.text = self.text + unicode(data, "utf-8")
242 self.text = self.text + unicode(data, "utf-8").strip() + u' '
244 def handle_entityref(self, name):
# Entity reference: look the lower-cased name up in `entities` (defined
# outside this listing). The other visible assignments to entity are
# unichr(int(name[1:])) and the literal "&" + name + ";" text. The result is
# appended to the paragraph buffer, the blockquote buffer or self.text.
# NOTE(review): unichr() returns a unicode object, and the appends below
# call unicode(entity, "utf-8"); on Python 2 that presumably raises
# TypeError for a unicode argument - check whether that path is reachable.
246 if entities.has_key(name.lower()):
247 entity = entities[name.lower()]
249 entity = unichr(int(name[1:]))
251 entity = "&" + name + ";"
254 self.currentparagraph = self.currentparagraph \
255 + unicode(entity, "utf-8")
256 elif self.inblockquote:
257 self.blockquote = self.blockquote + unicode(entity, "utf-8")
259 self.text = self.text + unicode(entity, "utf-8")
# (statement from a method whose def line is not shown in this listing)
# append the still-pending paragraph, wrapped at 70 columns, to data
264 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# parse_and_deliver(maildir, url, statedir): parse the feed at `url` with
# feedparser and, for each item, build a multipart/alternative message and
# write it to a file under `maildir`. A dbm database named "seen" inside
# `statedir` stores, per url + "|" + item link, the message-id, created date
# and content md5 of the item.
# NOTE(review): the embedded line numbers skip, so some statements of this
# function (e.g. what happens when the stored md5 matches) are not shown in
# this listing.
267 def parse_and_deliver(maildir, url, statedir):
268 fp = feedparser.parse(url)
269 db = dbm.open(os.path.join(statedir, "seen"), "c")
270 for item in fp["items"]:
271 # have we seen it before?
272 # need to work out what the content is first...
# use the value of the first "content" entry when the item has one;
# item["summary"] is the other visible source
274 if item.has_key("content"):
275 content = item["content"][0]["value"]
277 content = item["summary"]
# md5 of the UTF-8 encoded content; compared below with the "contentmd5"
# stored for this url|link key (the stored value is a query string, parsed
# with cgi.parse_qs)
279 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
281 if db.has_key(url + "|" + item["link"]):
282 data = db[url + "|" + item["link"]]
283 data = cgi.parse_qs(data)
284 if data["contentmd5"][0] == md5sum:
288 author = item["author"]
292 # create a basic email message
293 msg = MIMEMultipart("alternative")
295 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
299 string.ascii_letters + string.digits \
300 ) for a in range(0,6) \
301 ]) + "@" + socket.gethostname() + ">"
302 msg.add_header("Message-ID", messageid)
303 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
304 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
305 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Date header is built from the first six fields of item["updated_parsed"]
# NOTE(review): %e and %T here, and %s in the filename below, are not among
# the strftime directives Python documents as portable - presumably they
# rely on the platform C library; confirm on the target systems.
306 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
307 .strftime("%a, %e %b %Y %T -0000")
308 msg.add_header("Date", createddate)
309 msg.add_header("Subject", item["title"])
310 msg.set_default_type("text/plain")
# two parts: the original HTML, and a plain-text rendering of the same
# content produced by HTML2Text
312 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
313 textparser = HTML2Text()
314 textparser.feed(content.encode("utf-8"))
315 textcontent = textparser.gettext()
316 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
320 # start by working out the filename we should be writing to, we do
321 # this following the normal maildir style rules
322 fname = str(os.getpid()) \
323 + "." + socket.gethostname() \
326 string.ascii_letters + string.digits \
327 ) for a in range(0,10) \
329 + datetime.datetime.now().strftime('%s')
330 fn = os.path.join(maildir, "tmp", fname)
332 fh.write(msg.as_string())
334 # now move it in to the new directory
335 newfn = os.path.join(maildir, "new", fname)
339 # now add to the database about the item
# the record is stored as a urlencoded string, keyed by url + "|" + link
340 data = urllib.urlencode((
341 ("message-id", messageid), \
342 ("created", createddate), \
343 ("contentmd5", md5sum) \
345 db[url + "|" + item["link"]] = data
# Script body: parse the -c/--conf and -s/--statedir options, locate a
# config file, resolve the state directory and the Maildir root, then make
# sure each feed's maildir exists and call parse_and_deliver for it.
# NOTE(review): the embedded line numbers skip, so the try/except/else
# structure around the os.stat and os.mkdir calls is not shown in this
# listing.
349 # first off, parse the command line arguments
351 oparser = OptionParser()
353 "-c", "--conf", dest="conf",
354 help="location of config file"
357 "-s", "--statedir", dest="statedir",
358 help="location of directory to store state in"
361 (options, args) = oparser.parse_args()
363 # check for the configfile
367 if options.conf != None:
368 # does the file exist?
370 os.stat(options.conf)
371 configfile = options.conf
373 # should exit here as the specified file doesn't exist
375 "Config file %s does not exist. Exiting.\n" %(options.conf,))
378 # check through the default locations
380 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
381 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
384 os.stat("/etc/rss2maildir.conf")
385 configfile = "/etc/rss2maildir.conf"
387 sys.stderr.write("No config file found. Exiting.\n")
390 # Right - if we've got this far, we've got a config file, now for the hard
393 scp = SafeConfigParser()
396 maildir_root = "RSSMaildir"
399 if options.statedir != None:
400 state_dir = options.statedir
402 mode = os.stat(state_dir)[stat.ST_MODE]
403 if not stat.S_ISDIR(mode):
405 "State directory (%s) is not a directory\n" %(state_dir))
408 # try to make the directory
412 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
414 elif scp.has_option("general", "state_dir"):
415 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the stat and the message below use state_dir although the
# configured value was just read into new_state_dir - confirm which variable
# is intended here.
417 mode = os.stat(state_dir)[stat.ST_MODE]
418 if not stat.S_ISDIR(mode):
420 "State directory (%s) is not a directory\n" %(state_dir))
425 os.mkdir(new_state_dir)
426 state_dir = new_state_dir
429 "Couldn't create state directory %s\n" %(new_state_dir))
433 mode = os.stat(state_dir)[stat.ST_MODE]
434 if not stat.S_ISDIR(mode):
436 "State directory %s is not a directory\n" %(state_dir))
443 "State directory %s could not be created\n" %(state_dir))
446 if scp.has_option("general", "maildir_root"):
447 maildir_root = scp.get("general", "maildir_root")
450 mode = os.stat(maildir_root)[stat.ST_MODE]
451 if not stat.S_ISDIR(mode):
453 "Maildir Root %s is not a directory\n" \
458 os.mkdir(maildir_root)
460 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# the config sections are the feeds; "general" is removed from the list
463 feeds = scp.sections()
465 feeds.remove("general")
469 for section in feeds:
470 # check if the directory exists
473 maildir = scp.get(section, "maildir")
# urlencode the (section, maildir) pair and keep the part after the "=",
# i.e. the encoded maildir name, then place it under maildir_root
477 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
478 maildir = os.path.join(maildir_root, maildir)
481 exists = os.stat(maildir)
482 if stat.S_ISDIR(exists[stat.ST_MODE]):
483 # check if there's a new, cur and tmp directory
485 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
487 os.mkdir(os.path.join(maildir, "cur"))
488 if not stat.S_ISDIR(mode):
489 sys.stderr.write("Broken maildir: %s\n" %(maildir))
491 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
493 os.mkdir(os.path.join(maildir, "tmp"))
494 if not stat.S_ISDIR(mode):
495 sys.stderr.write("Broken maildir: %s\n" %(maildir))
497 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
498 if not stat.S_ISDIR(mode):
499 sys.stderr.write("Broken maildir: %s\n" %(maildir))
501 os.mkdir(os.path.join(maildir, "new"))
503 sys.stderr.write("Broken maildir: %s\n" %(maildir))
508 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
511 os.mkdir(os.path.join(maildir, "new"))
512 os.mkdir(os.path.join(maildir, "cur"))
513 os.mkdir(os.path.join(maildir, "tmp"))
516 "Couldn't create required maildir directories for %s\n" \
520 # right - we've got the directories, we've got the section, we know the
# the section name itself is passed as the feed url
523 parse_and_deliver(maildir, section, state_dir)