4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML-to-plain-text converter built on HTMLParser.  parse_and_deliver()
# instantiates it, feeds it an item's HTML and calls gettext() to obtain the
# text/plain alternative of the message.
49 class HTML2Text(HTMLParser):
# Set up the converter.
#   textwidth -- column width passed to textwrap.wrap() when text is wrapped
#                (default 70).
# NOTE(review): the embedded line numbers skip 90-91 and 93-94, so further
# attribute initialisations are presumably missing from this copy; the other
# methods read self.text, self.curdata and self.opentags -- confirm against
# the full file.
89 def __init__(self,textwidth=70):
92 self.textwidth = textwidth
# Initialise the HTMLParser base class.
95 HTMLParser.__init__(self)
# HTMLParser callback for an opening tag.  In every visible branch the text
# collected so far is flushed via handle_curdata() and the lower-cased tag
# name is appended to self.opentags; `attrs` is not used by the visible code.
# NOTE(review): the embedded line numbers skip 102, 106-107, 110 and 112, so
# some short statements (likely `else:` and stack pops) are not visible in
# this copy -- confirm the exact branch structure against the full file.
97 def handle_starttag(self, tag, attrs):
98 tag_name = tag.lower()
99 if tag_name in self.blockleveltags:
100 # handle starting a new block - unless we're in a block element
101 # that can contain other blocks, we'll assume that we want to close
# <br> is treated specially: pending text is flushed before it is recorded.
103 if tag_name == u'br':
104 self.handle_curdata()
105 self.opentags.append(tag_name)
# Some tag is already open: flush the text that belongs to it first.
108 if len(self.opentags) > 0:
109 self.handle_curdata()
111 self.opentags.append(tag_name)
113 self.handle_curdata()
114 self.opentags.append(tag_name)
# HTMLParser callback for self-closing tags such as <br/>.  Only 'br' is
# acted on in the visible code; `attrs` is unused.
# NOTE(review): this appends to self.tags, whereas every other visible method
# works on self.opentags (the list handle_curdata() inspects).  Confirm that
# self.tags is actually defined and that this is not a typo for
# self.opentags.  The statement at embedded line 120 is not visible here.
116 def handle_startendtag(self, tag, attrs):
117 if tag.lower() == u'br':
118 self.tags.append(u'br')
119 self.handle_curdata() # just handle the data, don't do anything else
# Flush the pending character data (self.curdata) into the output buffer
# (self.text), formatted according to the most recently opened tag
# (self.opentags[-1]):
#   h1-h6       wrapped to textwidth and underlined; the underline spans the
#               full textwidth when the heading wrapped onto several lines,
#               otherwise it matches the heading's length
#   p           wrapped to textwidth
#   pre         appended unchanged
#   blockquote  wrapped to textwidth - 2
#   li          wrapped to textwidth - 3, started on a fresh line
#   other tags  wrapped to textwidth - 5 when there is any data
# NOTE(review): many short lines are missing from this copy (the embedded
# numbering skips e.g. 124-125, 130, 134-135, 141, 143-144, 151, 161,
# 168-169, 173-174, 179-180, 182-183, 185-186), including the early returns,
# the underline-character assignments and the end of several expressions.
# Confirm details against the full file.
# NOTE(review): the text is encoded to UTF-8 bytes and then combined with
# unicode literals (and, for headings, encoded a second time).  Under
# Python 2 that forces an implicit ASCII decode, which looks like it would
# fail for non-ASCII content -- confirm.
122 def handle_curdata(self):
123 if len(self.opentags) == 0:
126 tag_thats_done = self.opentags[-1]
# Block-level output is separated from any earlier output by a blank line.
128 if tag_thats_done in self.blockleveltags:
129 newlinerequired = self.text != u''
131 self.text = self.text + u'\n\n'
133 if tag_thats_done in ["h1", "h2", "h3", "h4", "h5", "h6"]:
136 headingtext = self.curdata.encode("utf-8").strip()
137 headingtext = u'\n'.join( \
138 textwrap.wrap(headingtext, self.textwidth))
# The underline character depends on the heading level (h1 / h2 / other).
140 if tag_thats_done == u'h2':
142 elif tag_thats_done != u'h1':
145 if u'\n' in headingtext:
146 underline = underlinechar * self.textwidth
148 underline = underlinechar * len(headingtext)
149 self.text = self.text \
150 + headingtext.encode("utf-8") + u'\n' \
152 elif tag_thats_done == "p":
153 paragraph = self.curdata.encode("utf-8").strip()
154 self.text = self.text \
155 + u'\n'.join(textwrap.wrap(paragraph, self.textwidth))
156 elif tag_thats_done == "pre":
157 self.text = self.text + self.curdata
# NOTE(review): the wrapped lines are joined with u'> ' and no newline, so
# the quote marker appears to end up inside a single line -- confirm intent.
158 elif tag_thats_done == "blockquote":
159 quote = self.curdata.encode("utf-8").strip()
160 self.text = self.text \
162 + u'> '.join(textwrap.wrap(quote, self.textwidth - 2))
163 elif tag_thats_done == "li":
164 item = self.curdata.encode("utf-8").strip()
165 if len(self.text) > 0 and self.text[-1] != u'\n':
166 self.text = self.text + u'\n'
167 self.text = self.text \
170 textwrap.wrap(item, self.textwidth - 3))
172 elif tag_thats_done in self.liststarttags:
175 # we've got no idea what this tag does, so we'll
176 # make an assumption that we're not going to know later
177 if len(self.curdata) > 0:
178 self.text = self.text \
181 textwrap.wrap(self.curdata, self.textwidth - 5))
184 if tag_thats_done in self.blockleveltags:
# HTMLParser callback for a closing tag.  Locates `tag` in self.opentags,
# advances to its last occurrence, flushes pending text when that tag is not
# the most recently opened one, and truncates self.opentags to the entries
# that precede it.
# NOTE(review): list.index() raises ValueError for a tag that is not open;
# the comments below suggest this is handled by try/except lines that are
# not visible in this copy (embedded numbering skips 188, 190, 192-194, 197,
# 199, 201) -- confirm against the full file.
187 def handle_endtag(self, tag):
189 tagindex = self.opentags.index(tag)
191 # closing tag we know nothing about.
# Move forward to the last occurrence of the tag in the open-tag list.
195 while tagindex < len(self.opentags) \
196 and tag in self.opentags[tagindex+1:]:
198 tagindex = self.opentags.index(tag, tagindex+1)
200 # well, we don't want to do that then
202 if tagindex != len(self.opentags) - 1:
203 # Assuming the data was for the last opened tag first
204 self.handle_curdata()
205 # Now kill the list to be a slice before this tag was opened
206 self.opentags = self.opentags[:tagindex]
def handle_data(self, data):
    """Append a run of character data to the pending text buffer.

    HTMLParser calls this for text found between tags.  Byte strings are
    decoded as UTF-8 (as before); text that is already unicode is appended
    unchanged instead of raising ``TypeError`` ("decoding Unicode is not
    supported"), which is what ``unicode(data, "utf-8")`` does when the
    parser has been fed a unicode document.

    Args:
        data: the text run, either a UTF-8 encoded byte string or unicode.

    Raises:
        UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    # `bytes` is an alias of `str` on Python 2.6+, so this matches exactly
    # the inputs the old unconditional decode was able to handle.
    if isinstance(data, bytes):
        data = data.decode("utf-8")
    self.curdata = self.curdata + data
# HTMLParser callback for an entity reference such as &amp;.  The visible
# code resolves `name` in three ways -- a lookup in the class-level
# HTML2Text.entities table (case-insensitive), a numeric conversion via
# unichr(int(name[1:])), and a fallback that re-emits the reference
# literally as "&name;" -- and appends the result to self.curdata.
# NOTE(review): the lines selecting between these alternatives (embedded
# 212, 215, 217, 219) are not visible in this copy -- confirm the structure.
# NOTE(review): unichr() returns a unicode object, and unicode(x, "utf-8")
# raises TypeError for unicode input, so the final statement looks unsafe
# for the numeric case if it is reached with that value -- confirm.
211 def handle_entityref(self, name):
213 if HTML2Text.entities.has_key(name.lower()):
214 entity = HTML2Text.entities[name.lower()]
216 entity = unichr(int(name[1:]))
218 entity = "&" + name + ";"
220 self.curdata = self.curdata + unicode(entity, "utf-8")
# NOTE(review): the `def` line for this body (embedded line 222) is not
# visible in this copy; presumably this is gettext(), which
# parse_and_deliver() calls on the parser after feed() -- confirm.
# Flushes any text still pending and makes sure the accumulated output ends
# with a newline.  The return statement is not visible here.
223 self.handle_curdata()
224 if len(self.text) == 0 or self.text[-1] != u'\n':
225 self.text = self.text + u'\n'
# Issue an HTTP request for `url` using `method` (callers pass "HEAD" or
# "GET"), making at most three attempts while handling redirect statuses
# (301, 302, 303, 307) by inspecting the "location" response header.
# Only plain HTTP is attempted: the connection is always an
# httplib.HTTPConnection, whatever scheme splittype() reports.
# NOTE(review): the embedded numbering skips 230, 235-237, 245, 247-249 and
# 251-252, so the counter initialisation, the handling of a missing port,
# the redirect assignment and the return statements are not visible in this
# copy -- confirm against the full file.  Callers use the result's
# getheaders() and pass it to feedparser.parse().
229 def open_url(method, url):
231 while redirectcount < 3:
# Split the URL into scheme, host, port and path.  (`type` shadows the
# builtin of the same name inside this function.)
232 (type, rest) = urllib.splittype(url)
233 (host, path) = urllib.splithost(rest)
234 (host, port) = urllib.splitport(host)
238 conn = httplib.HTTPConnection("%s:%s" %(host, port))
239 conn.request(method, path)
240 response = conn.getresponse()
241 if response.status in [301, 302, 303, 307]:
242 headers = response.getheaders()
243 for header in headers:
244 if header[0] == "location":
246 elif response.status == 200:
250 redirectcount = redirectcount + 1
# Fetch the feed at `url` and deliver its items as messages into `maildir`.
#   maildir  -- path of a Maildir; messages are written under its "tmp"
#               subdirectory and a path under "new" is computed for the move
#   url      -- feed URL; also used as the key (and key prefix) in the state
#               databases and in the To header / unix From line
#   statedir -- directory holding the dbm files "feeds" (per-feed HTTP
#               validators) and "seen" (per-item message-id, created date and
#               content md5); both are opened with flag "c"
# Outline of the visible code:
#   1. For a known feed, send a HEAD request and compare content-length,
#      etag, last-modified and content-md5 with the stored values; the feed
#      is fetched with GET, or the function returns when nothing changed.
#   2. Parse the feed with feedparser and, for each item, hash its content
#      and look it up in "seen" under "url|guid" and "url|link".
#   3. Build a multipart/alternative message with an HTML part and a plain
#      text part (rendered by HTML2Text), and write it to the Maildir.
#   4. Record the item in "seen", then collect the feed's validator headers.
# NOTE(review): a large number of short lines (try/except/else, continue,
# variable initialisations, file open/close/rename, attach calls, db writes)
# are missing from this copy, as the gaps in the embedded numbering show.
# The comments below describe only what the visible statements do.
253 def parse_and_deliver(maildir, url, statedir):
256 # first check if we know about this feed already
257 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
258 if feeddb.has_key(url):
# cgi.parse_qs() yields a list per key, hence the [0] indexing below.
260 data = cgi.parse_qs(data)
# A HEAD request is enough to compare the validators saved previously.
261 response = open_url("HEAD", url)
264 headers = response.getheaders()
# NOTE(review): data[...] is indexed directly; a header that was not stored
# on an earlier run would raise KeyError unless that is caught on a line not
# visible here -- confirm.
267 for header in headers:
268 if header[0] == "content-length":
269 if header[1] != data["content-length"][0]:
271 elif header[0] == "etag":
272 if header[1] != data["etag"][0]:
274 elif header[0] == "last-modified":
275 if header[1] != data["last-modified"][0]:
277 elif header[0] == "content-md5":
278 if header[1] != data["content-md5"][0]:
# Download the feed body itself.
283 response = open_url("GET", url)
285 headers = response.getheaders()
286 feedhandle = response
288 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
291 return # don't need to do anything, nothing's changed.
# Download path for a feed with no stored state (presumably the else branch
# of the has_key check above -- the `else:` line is not visible).
293 response = open_url("GET", url)
295 headers = response.getheaders()
296 feedhandle = response
298 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
301 fp = feedparser.parse(feedhandle)
302 db = dbm.open(os.path.join(statedir, "seen"), "c")
303 for item in fp["items"]:
304 # have we seen it before?
305 # need to work out what the content is first...
# Use the first "content" entry when the item has one, otherwise its summary.
307 if item.has_key("content"):
308 content = item["content"][0]["value"]
310 content = item["summary"]
# The md5 of the UTF-8 encoded content is what marks an item as unchanged.
312 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
316 # check if there's a guid too - if that exists and we match the md5,
318 if item.has_key("guid"):
319 if db.has_key(url + "|" + item["guid"]):
320 data = db[url + "|" + item["guid"]]
321 data = cgi.parse_qs(data)
322 if data["contentmd5"][0] == md5sum:
# Look the item up by link as well; a stored message-id is kept so that the
# new message can reference the earlier one.
325 if db.has_key(url + "|" + item["link"]):
326 data = db[url + "|" + item["link"]]
327 data = cgi.parse_qs(data)
328 if data.has_key("message-id"):
329 prevmessageid = data["message-id"][0]
330 if data["contentmd5"][0] == md5sum:
334 author = item["author"]
338 # create a basic email message
339 msg = MIMEMultipart("alternative")
341 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
345 string.ascii_letters + string.digits \
346 ) for a in range(0,6) \
347 ]) + "@" + socket.gethostname() + ">"
348 msg.add_header("Message-ID", messageid)
# The feed URL is used as the display name of the unix From line and of the
# To header; the item's author is the display name of the From header.
349 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
350 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
351 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
353 msg.add_header("References", prevmessageid)
# NOTE(review): %e and %T are platform strftime extensions rather than
# directives documented by Python, and "-0000" is appended to a naive local
# time from now() -- confirm this is acceptable.
354 createddate = datetime.datetime.now() \
355 .strftime("%a, %e %b %Y %T -0000")
357 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
358 .strftime("%a, %e %b %Y %T -0000")
361 msg.add_header("Date", createddate)
362 msg.add_header("Subject", item["title"])
363 msg.set_default_type("text/plain")
# NOTE(review): content is encoded here and htmlcontent is encoded again
# when the HTML part is built; under Python 2 the second encode implies an
# ASCII decode, which looks like it would fail for non-ASCII content --
# confirm.
365 htmlcontent = content.encode("utf-8")
366 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
370 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Render the same content as plain text for the text/plain alternative.
371 textparser = HTML2Text()
372 textparser.feed(content.encode("utf-8"))
373 textcontent = textparser.gettext()
374 textcontent = "%s\n\nItem URL: %s" %( \
377 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
381 # start by working out the filename we should be writing to, we do
382 # this following the normal maildir style rules
383 fname = str(os.getpid()) \
384 + "." + socket.gethostname() \
387 string.ascii_letters + string.digits \
388 ) for a in range(0,10) \
390 + datetime.datetime.now().strftime('%s')
391 fn = os.path.join(maildir, "tmp", fname)
393 fh.write(msg.as_string())
395 # now move it in to the new directory
396 newfn = os.path.join(maildir, "new", fname)
400 # now add to the database about the item
# The stored message-id accumulates earlier ids, separated by a space.
402 messageid = prevmessageid + " " + messageid
403 if item.has_key("guid") and item["guid"] != item["link"]:
404 data = urllib.urlencode(( \
405 ("message-id", messageid), \
406 ("created", createddate), \
407 ("contentmd5", md5sum) \
409 db[url + "|" + item["guid"]] = data
411 data = db[url + "|" + item["link"]]
412 data = cgi.parse_qs(data)
413 newdata = urllib.urlencode(( \
414 ("message-id", messageid), \
415 ("created", data["created"][0]), \
416 ("contentmd5", data["contentmd5"][0]) \
418 db[url + "|" + item["link"]] = newdata
420 db[url + "|" + item["link"]] = data
422 data = urllib.urlencode(( \
423 ("message-id", messageid), \
424 ("created", createddate), \
425 ("contentmd5", md5sum) \
427 db[url + "|" + item["link"]] = data
# Collect the response's validator headers and URL-encode them; these are
# the same four headers the HEAD comparison at the top checks.
431 for header in headers:
432 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
433 data.append((header[0], header[1]))
435 data = urllib.urlencode(data)
# Script entry point.  The visible code:
#   1. parses the -c/--conf and -s/--statedir command-line options;
#   2. picks the config file: the one given with -c, else
#      $HOME/.rss2maildir.conf, else /etc/rss2maildir.conf;
#   3. determines the state directory (command line, else the "state_dir"
#      option of the [general] section, else a default set on a line not
#      visible here) and the Maildir root (default "RSSMaildir", overridable
#      via "maildir_root" in [general]), creating directories as needed;
#   4. treats every config section other than "general" as a feed whose
#      section name is the feed URL, makes sure its Maildir with cur/, tmp/
#      and new/ exists, and calls parse_and_deliver() for it.
# NOTE(review): many short lines (try/except/else, sys.exit calls, the
# add_option/stderr.write openers, the config read) are missing from this
# copy, as the gaps in the embedded numbering show.  The comments describe
# only what the visible statements do.
441 if __name__ == "__main__":
442 # This only gets executed if we really called the program
443 # first off, parse the command line arguments
445 oparser = OptionParser()
447 "-c", "--conf", dest="conf",
448 help="location of config file"
451 "-s", "--statedir", dest="statedir",
452 help="location of directory to store state in"
455 (options, args) = oparser.parse_args()
457 # check for the configfile
461 if options.conf != None:
462 # does the file exist?
464 os.stat(options.conf)
465 configfile = options.conf
467 # should exit here as the specified file doesn't exist
469 "Config file %s does not exist. Exiting.\n" %(options.conf,))
472 # check through the default locations
474 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
475 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
478 os.stat("/etc/rss2maildir.conf")
479 configfile = "/etc/rss2maildir.conf"
481 sys.stderr.write("No config file found. Exiting.\n")
484 # Right - if we've got this far, we've got a config file, now for the hard
487 scp = SafeConfigParser()
490 maildir_root = "RSSMaildir"
# A state directory given on the command line takes precedence over the
# config file.
493 if options.statedir != None:
494 state_dir = options.statedir
496 mode = os.stat(state_dir)[stat.ST_MODE]
497 if not stat.S_ISDIR(mode):
499 "State directory (%s) is not a directory\n" %(state_dir))
502 # try to make the directory
506 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
508 elif scp.has_option("general", "state_dir"):
509 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats (and reports) state_dir although the configured
# value was just read into new_state_dir, which is what gets created below
# -- looks like it should check new_state_dir; confirm.
511 mode = os.stat(state_dir)[stat.ST_MODE]
512 if not stat.S_ISDIR(mode):
514 "State directory (%s) is not a directory\n" %(state_dir))
519 os.mkdir(new_state_dir)
520 state_dir = new_state_dir
523 "Couldn't create state directory %s\n" %(new_state_dir))
527 mode = os.stat(state_dir)[stat.ST_MODE]
528 if not stat.S_ISDIR(mode):
530 "State directory %s is not a directory\n" %(state_dir))
537 "State directory %s could not be created\n" %(state_dir))
540 if scp.has_option("general", "maildir_root"):
541 maildir_root = scp.get("general", "maildir_root")
544 mode = os.stat(maildir_root)[stat.ST_MODE]
545 if not stat.S_ISDIR(mode):
547 "Maildir Root %s is not a directory\n" \
552 os.mkdir(maildir_root)
554 sys.stderr.write("Couldn't create Maildir Root %s\n" \
558 feeds = scp.sections()
560 feeds.remove("general")
564 for section in feeds:
565 # check if the directory exists
568 maildir = scp.get(section, "maildir")
# URL-quote the maildir name by round-tripping it through urlencode, so the
# name is safe to use as a single path component under maildir_root.
572 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
573 maildir = os.path.join(maildir_root, maildir)
576 exists = os.stat(maildir)
577 if stat.S_ISDIR(exists[stat.ST_MODE]):
578 # check if there's a new, cur and tmp directory
580 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
582 os.mkdir(os.path.join(maildir, "cur"))
583 if not stat.S_ISDIR(mode):
584 sys.stderr.write("Broken maildir: %s\n" %(maildir))
586 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
588 os.mkdir(os.path.join(maildir, "tmp"))
589 if not stat.S_ISDIR(mode):
590 sys.stderr.write("Broken maildir: %s\n" %(maildir))
592 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
593 if not stat.S_ISDIR(mode):
594 sys.stderr.write("Broken maildir: %s\n" %(maildir))
596 os.mkdir(os.path.join(maildir, "new"))
598 sys.stderr.write("Broken maildir: %s\n" %(maildir))
603 sys.stderr.write("Couldn't create root maildir %s\n" \
607 os.mkdir(os.path.join(maildir, "new"))
608 os.mkdir(os.path.join(maildir, "cur"))
609 os.mkdir(os.path.join(maildir, "tmp"))
612 "Couldn't create required maildir directories for %s\n" \
616 # right - we've got the directories, we've got the section, we know the
619 parse_and_deliver(maildir, section, state_dir)