# rss2maildir.py - RSS feeds to Maildir 1 email per item
# Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# standard library modules used throughout the script
import sys, os, stat, socket, string, random, textwrap, datetime
import urllib, httplib, cgi, dbm, md5
import feedparser

from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from optparse import OptionParser
from ConfigParser import SafeConfigParser
from base64 import b64encode
from HTMLParser import HTMLParser

class HTML2Text(HTMLParser):

    def __init__(self, textwidth=70):
        self.text = u''
        self.curdata = u''
        self.opentags = []
        self.textwidth = textwidth
        HTMLParser.__init__(self)
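
    # The methods below rely on three class-level tables that this listing
    # does not show: entities (named HTML entities), blockleveltags and
    # liststarttags. What follows is a minimal, assumed sketch so the class
    # is self-contained - the tables in the original script are larger.
    entities = {
        u'amp': u'&',
        u'lt': u'<',
        u'gt': u'>',
        u'quot': u'"',
        u'apos': u"'",
        u'nbsp': u' ',
        }
    blockleveltags = [
        u'h1', u'h2', u'h3', u'h4', u'h5', u'h6',
        u'p', u'pre', u'blockquote',
        u'ul', u'ol', u'li', u'dl', u'dt', u'dd',
        u'div', u'br',
        ]
    liststarttags = [
        u'ul', u'ol', u'dl',
        ]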

    def handle_starttag(self, tag, attrs):
        tag_name = tag.lower()
        if tag_name in self.blockleveltags:
            # handle starting a new block - unless we're in a block element
            # that can contain other blocks, we'll assume that we want to close
            # the previous block
            if tag_name == u'br':
                self.handle_curdata()
                self.opentags.append(tag_name)
                self.opentags.pop()
            else:
                if len(self.opentags) > 0:
                    self.handle_curdata()
                    self.opentags.pop()
                    self.opentags.append(tag_name)
                else:
                    self.handle_curdata()
                    self.opentags.append(tag_name)

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == u'br':
            self.opentags.append(u'br')
            self.handle_curdata() # just handle the data, don't do anything else
            self.opentags.pop()

    def handle_curdata(self):
        if len(self.opentags) == 0:
            return

        if len(self.curdata) == 0:
            return

        tag_thats_done = self.opentags[-1]

        if tag_thats_done in self.blockleveltags:
            newlinerequired = self.text != u''
            if newlinerequired:
                self.text = self.text + u'\n\n'

        if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            underlinechar = u'='
            headingtext = self.curdata.encode("utf-8").strip()
            headingtext = u'\n'.join( \
                textwrap.wrap(headingtext, self.textwidth))
            if tag_thats_done == u'h2':
                underlinechar = u'-'
            elif tag_thats_done != u'h1':
                underlinechar = u'~'

            if u'\n' in headingtext:
                underline = underlinechar * self.textwidth
            else:
                underline = underlinechar * len(headingtext)
            # headingtext is already encoded - don't encode it a second time
            self.text = self.text \
                + headingtext + u'\n' \
                + underline
        elif tag_thats_done == "p":
            paragraph = self.curdata.encode("utf-8").strip()
            self.text = self.text \
                + u'\n'.join(textwrap.wrap(paragraph, self.textwidth))
        elif tag_thats_done == "pre":
            self.text = self.text + self.curdata
        elif tag_thats_done == "blockquote":
            quote = self.curdata.encode("utf-8").strip()
            self.text = self.text \
                + u'> ' \
                + u'\n> '.join(textwrap.wrap(quote, self.textwidth - 2))
        elif tag_thats_done == "li":
            item = self.curdata.encode("utf-8").strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u' * ' \
                + u'\n   '.join( \
                    textwrap.wrap(item, self.textwidth - 3))
        elif tag_thats_done == "dt":
            definition = self.curdata.encode("utf-8").strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n\n'
            elif len(self.text) > 0 and self.text[-2] != u'\n':
                self.text = self.text + u'\n'
            definition = definition + "::"
            self.text = self.text \
                + u'\n '.join( \
                    textwrap.wrap(definition, self.textwidth - 1))
        elif tag_thats_done == "dd":
            definition = self.curdata.encode("utf-8").strip()
            if len(self.text) > 0 and self.text[-1] != u'\n':
                self.text = self.text + u'\n'
            self.text = self.text \
                + u'    ' \
                + u'\n    '.join( \
                    textwrap.wrap(definition, self.textwidth - 4))
        elif tag_thats_done in self.liststarttags:
            # the list container itself produces no text of its own
            pass
        else:
            # we've got no idea what this tag does, so we'll
            # make an assumption that we're not going to know later
            if len(self.curdata) > 0:
                self.text = self.text \
                    + u'     ' \
                    + u'\n     '.join( \
                        textwrap.wrap(self.curdata, self.textwidth - 5))

        if tag_thats_done in self.blockleveltags:
            self.curdata = u''

    def handle_endtag(self, tag):
        try:
            tagindex = self.opentags.index(tag)
        except ValueError:
            # closing tag we know nothing about.
            # skip it
            return
        while tagindex < len(self.opentags) \
            and tag in self.opentags[tagindex+1:]:
            try:
                tagindex = self.opentags.index(tag, tagindex+1)
            except ValueError:
                # well, we don't want to do that then
                break
        if tagindex != len(self.opentags) - 1:
            # Assuming the data was for the last opened tag first
            self.handle_curdata()
            # Now kill the list to be a slice before this tag was opened
            self.opentags = self.opentags[:tagindex]

    def handle_data(self, data):
        self.curdata = self.curdata + unicode(data, "utf-8")

    def handle_entityref(self, name):
        entity = u''
        if HTML2Text.entities.has_key(name.lower()):
            entity = HTML2Text.entities[name.lower()]
        elif name[0] == "#":
            entity = unichr(int(name[1:]))
        else:
            entity = "&" + name + ";"
        # entity may already be a unicode object here, so don't try to
        # decode it again
        self.curdata = self.curdata + unicode(entity)

    def gettext(self):
        self.handle_curdata()
        if len(self.text) == 0 or self.text[-1] != u'\n':
            self.text = self.text + u'\n'

        if len(self.text) > 0:
            while len(self.text) > 1 and self.text[-1] == u'\n':
                self.text = self.text[:-1]
            self.text = self.text + u'\n'
        return self.text
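
# Illustrative example (not part of the original script): with the sketched
# tables above, feeding
#     <h1>Title</h1><p>Some &amp; text</p>
# through HTML2Text and calling gettext() yields roughly:
#     Title
#     =====
#
#     Some & text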

def open_url(method, url):
    # fetch the url, following up to three redirects; returns the httplib
    # response on success or None on failure
    redirectcount = 0
    while redirectcount < 3:
        (type, rest) = urllib.splittype(url)
        (host, path) = urllib.splithost(rest)
        (host, port) = urllib.splitport(host)
        if port == None:
            port = 80
        conn = httplib.HTTPConnection("%s:%s" %(host, port))
        conn.request(method, path)
        response = conn.getresponse()
        if response.status in [301, 302, 303, 307]:
            headers = response.getheaders()
            for header in headers:
                if header[0] == "location":
                    url = header[1]
        elif response.status == 200:
            return response
        redirectcount = redirectcount + 1
    return None

def parse_and_deliver(maildir, url, statedir):

    # first check if we know about this feed already
    feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
    if feeddb.has_key(url):
        data = feeddb[url]
        data = cgi.parse_qs(data)
        # a HEAD request is enough to compare the cache validators we stored
        # last time round - only do a full GET if something has changed
        response = open_url("HEAD", url)
        if response is None:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return
        headers = response.getheaders()
        ischanged = False
        for header in headers:
            if header[0] == "content-length":
                if header[1] != data["content-length"][0]:
                    ischanged = True
            elif header[0] == "etag":
                if header[1] != data["etag"][0]:
                    ischanged = True
            elif header[0] == "last-modified":
                if header[1] != data["last-modified"][0]:
                    ischanged = True
            elif header[0] == "content-md5":
                if header[1] != data["content-md5"][0]:
                    ischanged = True
        if ischanged:
            response = open_url("GET", url)
            if response != None:
                headers = response.getheaders()
                feedhandle = response
            else:
                sys.stderr.write("Failed to fetch feed: %s\n" %(url))
                return
        else:
            return # don't need to do anything, nothings changed.
    else:
        response = open_url("GET", url)
        if response != None:
            headers = response.getheaders()
            feedhandle = response
        else:
            sys.stderr.write("Failed to fetch feed: %s\n" %(url))
            return

    fp = feedparser.parse(feedhandle)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    for item in fp["items"]:
        # have we seen it before?
        # need to work out what the content is first...

        if item.has_key("content"):
            content = item["content"][0]["value"]
        else:
            content = item["summary"]

        md5sum = md5.md5(content.encode("utf-8")).hexdigest()

        prevmessageid = None

        # check if there's a guid too - if that exists and we match the md5,
        # we've already seen this item and can skip it
        if item.has_key("guid"):
            if db.has_key(url + "|" + item["guid"]):
                data = db[url + "|" + item["guid"]]
                data = cgi.parse_qs(data)
                if data["contentmd5"][0] == md5sum:
                    continue

        if db.has_key(url + "|" + item["link"]):
            data = db[url + "|" + item["link"]]
            data = cgi.parse_qs(data)
            if data.has_key("message-id"):
                prevmessageid = data["message-id"][0]
            if data["contentmd5"][0] == md5sum:
                continue

        author = item["author"]

        # create a basic email message
        msg = MIMEMultipart("alternative")
        messageid = "<" \
            + datetime.datetime.now().strftime("%Y%m%d%H%M") \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,6) \
                ]) + "@" + socket.gethostname() + ">"
        msg.add_header("Message-ID", messageid)
        msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
        msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
        msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
        if prevmessageid:
            msg.add_header("References", prevmessageid)
        createddate = datetime.datetime.now() \
            .strftime("%a, %e %b %Y %T -0000")
        try:
            createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
                .strftime("%a, %e %b %Y %T -0000")
        except:
            pass
        msg.add_header("Date", createddate)
        msg.add_header("Subject", item["title"])
        msg.set_default_type("text/plain")

        # content is a unicode object - build the full html body first and
        # only encode it once, when handing it to MIMEText
        htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
            content, \
            item["link"], \
            item["link"] )
        htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
        textparser = HTML2Text()
        textparser.feed(content.encode("utf-8"))
        textcontent = textparser.gettext()
        textcontent = "%s\n\nItem URL: %s" %( \
            textcontent, \
            item["link"] )
        textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
        msg.attach(textpart)
        msg.attach(htmlpart)

        # start by working out the filename we should be writing to, we do
        # this following the normal maildir style rules
        fname = str(os.getpid()) \
            + "." + socket.gethostname() \
            + "." + "".join( \
                [random.choice( \
                    string.ascii_letters + string.digits \
                    ) for a in range(0,10) \
                ]) + "." \
            + datetime.datetime.now().strftime('%s')
        fn = os.path.join(maildir, "tmp", fname)
        fh = open(fn, "w")
        fh.write(msg.as_string())
        fh.close()
        # now move it in to the new directory
        newfn = os.path.join(maildir, "new", fname)
        os.link(fn, newfn)
        os.unlink(fn)
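
        # the resulting name looks something like (illustrative values only):
        #   1234.myhost.aB3xYz9QwE.1190000000
        # i.e. pid.hostname.random.epoch-seconds, which keeps deliveries
        # unique across processes and hosts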

        # now add to the database about the item
        if prevmessageid:
            messageid = prevmessageid + " " + messageid
        if item.has_key("guid") and item["guid"] != item["link"]:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["guid"]] = data
            try:
                data = db[url + "|" + item["link"]]
                data = cgi.parse_qs(data)
                newdata = urllib.urlencode(( \
                    ("message-id", messageid), \
                    ("created", data["created"][0]), \
                    ("contentmd5", data["contentmd5"][0]) \
                    ))
                db[url + "|" + item["link"]] = newdata
            except:
                db[url + "|" + item["link"]] = data
        else:
            data = urllib.urlencode(( \
                ("message-id", messageid), \
                ("created", createddate), \
                ("contentmd5", md5sum) \
                ))
            db[url + "|" + item["link"]] = data

    # all items delivered - remember the feed's cache validators for next time
    data = []
    for header in headers:
        if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
            data.append((header[0], header[1]))

    data = urllib.urlencode(data)
    feeddb[url] = data

    db.close()
    feeddb.close()
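
# State kept between runs (both are dbm files in the state directory):
#   "feeds" maps a feed url to its urlencoded cache validators, e.g.
#       etag=...&last-modified=...&content-length=...
#   "seen" maps "feedurl|guid" / "feedurl|link" to urlencoded item data, e.g.
#       message-id=...&created=...&contentmd5=...
# (illustrative values only - the exact contents come from the code above)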

if __name__ == "__main__":
    # This only gets executed if we really called the program
    # first off, parse the command line arguments

    oparser = OptionParser()
    oparser.add_option( \
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option( \
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )
    (options, args) = oparser.parse_args()

    # check for the configfile

    configfile = None

    if options.conf != None:
        # does the file exist?
        try:
            os.stat(options.conf)
            configfile = options.conf
        except OSError:
            # should exit here as the specified file doesn't exist
            sys.stderr.write( \
                "Config file %s does not exist. Exiting.\n" %(options.conf,))
            sys.exit(2)
    else:
        # check through the default locations
        try:
            os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
            configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
        except OSError:
            try:
                os.stat("/etc/rss2maildir.conf")
                configfile = "/etc/rss2maildir.conf"
            except OSError:
                sys.stderr.write("No config file found. Exiting.\n")
                sys.exit(2)

    # Right - if we've got this far, we've got a config file, now for the hard
    # bits...

    scp = SafeConfigParser()
    scp.read(configfile)
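
    # the config file is standard INI syntax; a minimal sketch (assumed
    # layout, derived from the options read below) looks like:
    #
    #   [general]
    #   state_dir = /home/user/.rss2maildir/state
    #   maildir_root = /home/user/RSSMaildir
    #
    #   [http://example.com/feed.rss]
    #   maildir = example-feed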

    maildir_root = "RSSMaildir"
    state_dir = "state"  # assumed default; overridden by --statedir or the config

    if options.statedir != None:
        state_dir = options.statedir
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            # try to make the directory
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write("Couldn't create statedir %s\n" %(state_dir))
                sys.exit(1)
    elif scp.has_option("general", "state_dir"):
        new_state_dir = scp.get("general", "state_dir")
        try:
            mode = os.stat(new_state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory (%s) is not a directory\n" %(new_state_dir))
                sys.exit(1)
            state_dir = new_state_dir
        except OSError:
            try:
                os.mkdir(new_state_dir)
                state_dir = new_state_dir
            except OSError:
                sys.stderr.write( \
                    "Couldn't create state directory %s\n" %(new_state_dir))
                sys.exit(1)
    else:
        try:
            mode = os.stat(state_dir)[stat.ST_MODE]
            if not stat.S_ISDIR(mode):
                sys.stderr.write( \
                    "State directory %s is not a directory\n" %(state_dir))
                sys.exit(1)
        except OSError:
            try:
                os.mkdir(state_dir)
            except OSError:
                sys.stderr.write( \
                    "State directory %s could not be created\n" %(state_dir))
                sys.exit(1)

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    try:
        mode = os.stat(maildir_root)[stat.ST_MODE]
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Maildir Root %s is not a directory\n" \
                %(maildir_root))
            sys.exit(1)
    except OSError:
        try:
            os.mkdir(maildir_root)
        except OSError:
            sys.stderr.write("Couldn't create Maildir Root %s\n" \
                %(maildir_root))
            sys.exit(1)

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        pass

    for section in feeds:
        # check if the directory exists
        maildir = None
        try:
            maildir = scp.get(section, "maildir")
        except:
            # no maildir option for this feed - fall back to the section
            # name (the feed url) as the maildir name
            maildir = section

        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        try:
            exists = os.stat(maildir)
            if stat.S_ISDIR(exists[stat.ST_MODE]):
                # check if there's a new, cur and tmp directory
                try:
                    mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "cur"))
                try:
                    mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "tmp"))
                try:
                    mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
                    if not stat.S_ISDIR(mode):
                        sys.stderr.write("Broken maildir: %s\n" %(maildir))
                except OSError:
                    os.mkdir(os.path.join(maildir, "new"))
            else:
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
        except OSError:
            try:
                os.mkdir(maildir)
            except OSError:
                sys.stderr.write("Couldn't create root maildir %s\n" \
                    %(maildir))
                sys.exit(1)
            try:
                os.mkdir(os.path.join(maildir, "new"))
                os.mkdir(os.path.join(maildir, "cur"))
                os.mkdir(os.path.join(maildir, "tmp"))
            except OSError:
                sys.stderr.write( \
                    "Couldn't create required maildir directories for %s\n" \
                    %(section,))
                sys.exit(1)

        # right - we've got the directories, we've got the section, we know the
        # url... lets play!
        parse_and_deliver(maildir, section, state_dir)