4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that turns the HTML fed to it into
# wrapped plain text. Handlers collect text into per-context buffers
# (currentparagraph, headingtext, blockquote, item) and flush them into the
# output buffer self.text when a tag boundary is seen.
# NOTE(review): this listing has gaps (some `def` lines and branch bodies are
# not visible), so the comments below describe only the statements shown.
60 class HTML2Text(HTMLParser):
# Parser state flags; paragraph mode starts out enabled.
# NOTE(review): the enclosing `def` line is not visible here; these
# assignments presumably belong to __init__ -- confirm.
63 self.inheadingone = False
64 self.inheadingtwo = False
65 self.inotherheading = False
66 self.inparagraph = True
67 self.inblockquote = False
# Buffers for the paragraph and heading text currently being collected.
70 self.currentparagraph = u''
71 self.headingtext = u''
# Initialise the HTMLParser base class.
77 HTMLParser.__init__(self)
# Start-tag handler: switches the state flags for the tag (compared
# case-insensitively) and flushes pending text into self.text.
79 def handle_starttag(self, tag, attrs):
# Headings leave paragraph mode; h3-h6 share a single flag.
80 if tag.lower() == "h1":
81 self.inheadingone = True
82 self.inparagraph = False
83 elif tag.lower() == "h2":
84 self.inheadingtwo = True
85 self.inparagraph = False
86 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
87 self.inotherheading = True
88 self.inparagraph = False
89 elif tag.lower() == "a":
91 elif tag.lower() == "br":
93 elif tag.lower() == "blockquote":
94 self.inblockquote = True
95 self.text = self.text + u'\n'
# <p>: the paragraph collected so far is wrapped at 70 columns and appended,
# then the buffer is reset and paragraph mode is (re)entered.
96 elif tag.lower() == "p":
98 self.text = self.text + u'\n\n'
100 self.text = self.text \
101 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
102 self.currentparagraph = u''
103 self.inparagraph = True
# <pre>: leaves both paragraph and blockquote mode.
104 elif tag.lower() == "pre":
105 self.text = self.text + "\n"
107 self.inparagraph = False
108 self.inblockquote = False
109 elif tag.lower() == "ul":
112 self.text = self.text + "\n"
# <li> is only handled while inside a <ul>; item text is wrapped at 67 columns.
113 elif tag.lower() == "li" and self.inul:
118 self.text = self.text \
120 + u'\n '.join([a.strip() for a in \
121 textwrap.wrap(self.item, 67)]) \
# Self-closing tag handler; <br/> is the only tag tested in the visible code.
# It flushes the pending paragraph (wrapped at 70) or blockquote buffer.
125 def handle_startendtag(self, tag, attrs):
126 if tag.lower() == "br":
131 self.text = self.text \
134 for a in textwrap.wrap( \
135 self.currentparagraph, 70) \
139 self.currentparagraph = u''
# NOTE(review): this branch wraps the UTF-8 *encoded* blockquote, whereas
# handle_endtag wraps the unicode value directly. Mixing byte strings with
# the unicode self.text can raise UnicodeDecodeError on non-ASCII input
# under Python 2 -- confirm which form is intended.
140 elif self.inblockquote:
141 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.blockquote.encode("utf-8") \
151 self.blockquote = u''
153 self.text = self.text + "\n"
# End-tag handler: emits the buffered text for the element being closed and
# resets the matching state flag and buffer.
155 def handle_endtag(self, tag):
# Headings are written followed by an underline: '=' for h1, '-' for h2 and
# '~' for h3-h6.
# NOTE(review): the underline length is taken from the UTF-8 encoded heading,
# i.e. it counts bytes rather than characters, so it will be longer than the
# heading for non-ASCII text.
156 if tag.lower() == "h1":
157 self.inheadingone = False
158 self.text = self.text \
160 + self.headingtext.encode("utf-8") \
162 + u'=' * len(self.headingtext.encode("utf-8").strip())
163 self.headingtext = u''
164 elif tag.lower() == "h2":
165 self.inheadingtwo = False
166 self.text = self.text \
168 + self.headingtext.encode("utf-8") \
170 + u'-' * len(self.headingtext.encode("utf-8").strip())
171 self.headingtext = u''
172 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
173 self.inotherheading = False
174 self.text = self.text \
176 + self.headingtext.encode("utf-8") \
178 + u'~' * len(self.headingtext.encode("utf-8").strip())
179 self.headingtext = u''
# </p>: flush the paragraph wrapped at 70 columns and leave paragraph mode.
180 elif tag.lower() == "p":
181 self.text = self.text \
182 + u'\n'.join(textwrap.wrap( \
183 self.currentparagraph, 70) \
185 self.inparagraph = False
186 self.currentparagraph = u''
# </blockquote>: flush the quote wrapped at 68 columns and leave quote mode.
187 elif tag.lower() == "blockquote":
188 self.text = self.text \
192 for a in textwrap.wrap( \
193 self.blockquote, 68)] \
196 self.inblockquote = False
197 self.blockquote = u''
198 elif tag.lower() == "pre":
# </li>: flush the list item wrapped at 67 columns.
200 elif tag.lower() == "li":
203 self.text = self.text \
206 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
209 elif tag.lower() == "ul":
# Character-data handler: decodes `data` from UTF-8 and appends it to the
# buffer that matches the current state (heading, blockquote, paragraph,
# list item), or directly to self.text otherwise.
# `data` must be a byte string: unicode(x, "utf-8") raises TypeError when x
# is already unicode. parse_and_deliver feeds UTF-8 encoded bytes.
212 def handle_data(self, data):
213 if self.inheadingone or self.inheadingtwo or self.inotherheading:
214 self.headingtext = self.headingtext \
215 + unicode(data, "utf-8").strip() \
217 elif self.inblockquote:
218 self.blockquote = self.blockquote \
219 + unicode(data, "utf-8").strip() \
221 elif self.inparagraph:
222 self.currentparagraph = self.currentparagraph \
223 + unicode(data, "utf-8").strip() \
225 elif self.inul and self.initem:
226 self.item = self.item + unicode(data, "utf-8")
# Remaining cases append straight to the output buffer.
228 self.text = self.text + unicode(data, "utf-8")
230 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# Entity-reference handler: resolves `name` to replacement text and appends it
# to the paragraph, blockquote or output buffer.
232 def handle_entityref(self, name):
# Named entities are looked up case-insensitively in `entities`.
234 if entities.has_key(name.lower()):
235 entity = entities[name.lower()]
# NOTE(review): unichr() yields a unicode object, but the appends below call
# unicode(entity, "utf-8"), which rejects unicode input with TypeError. The
# lines in between are not visible -- confirm this path works.
237 entity = unichr(int(name[1:]))
# Fallback: the reference is re-emitted literally as "&name;".
239 entity = "&" + name + ";"
242 self.currentparagraph = self.currentparagraph \
243 + unicode(entity, "utf-8")
244 elif self.inblockquote:
245 self.blockquote = self.blockquote + unicode(entity, "utf-8")
247 self.text = self.text + unicode(entity, "utf-8")
# Remaining line of a method whose `def` is not visible (presumably gettext(),
# which parse_and_deliver calls): appends the still-unflushed paragraph,
# wrapped at 70 columns, to `data`.
252 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# Send an HTTP request of type `method` (callers use "HEAD" and "GET") for
# `url`, looping while fewer than 3 iterations have been counted so that
# redirects can be followed.
# NOTE(review): several lines are not visible in this listing, including the
# initialisation of redirectcount, the redirect/200 branch bodies and the
# return statement. Callers use the result's getheaders() and pass it to
# feedparser.parse(), so presumably the httplib response is returned.
255 def open_url(method, url):
257 while redirectcount < 3:
# Split the URL into scheme, host, port and path. `type` shadows the builtin
# and is not used in the visible lines.
258 (type, rest) = urllib.splittype(url)
259 (host, path) = urllib.splithost(rest)
260 (host, port) = urllib.splitport(host)
# NOTE(review): only a plain HTTPConnection is opened in the visible code, so
# https URLs do not appear to be supported. splitport() yields None for the
# port when the URL has none; presumably a default is applied in the lines
# not shown -- confirm.
264 conn = httplib.HTTPConnection("%s:%s" %(host, port))
265 conn.request(method, path)
266 response = conn.getresponse()
# Redirect statuses: look for the "location" header among the response headers.
267 if response.status in [301, 302, 303, 307]:
268 headers = response.getheaders()
269 for header in headers:
270 if header[0] == "location":
272 elif response.status == 200:
# Count this iteration against the limit of 3.
276 redirectcount = redirectcount + 1
# Fetch the feed at `url`, build one multipart/alternative message (HTML part
# plus plain-text part) per feed item and write it into the Maildir at
# `maildir`.
# State is kept in two dbm databases under `statedir`: "feeds", consulted per
# feed URL, and "seen", holding per-item message-id, created date and content
# md5 as URL-encoded strings keyed by "<url>|<guid>" or "<url>|<link>".
# NOTE(review): many lines of this function are not visible in this listing
# (try/except/else lines, several branch bodies), so the comments below
# describe only the statements that are shown.
279 def parse_and_deliver(maildir, url, statedir):
282 # first check if we know about this feed already
283 feeddb = dbm.open(os.path.join(statedir, "feeds"), "c")
284 if feeddb.has_key(url):
286 data = cgi.parse_qs(data)
# Known feed: issue a HEAD request and compare its cache-related headers
# (content-length, etag, last-modified, content-md5) with the stored values.
287 response = open_url("HEAD", url)
290 headers = response.getheaders()
293 for header in headers:
294 if header[0] == "content-length":
295 if header[1] != data["content-length"][0]:
297 elif header[0] == "etag":
298 if header[1] != data["etag"][0]:
300 elif header[0] == "last-modified":
301 if header[1] != data["last-modified"][0]:
303 elif header[0] == "content-md5":
304 if header[1] != data["content-md5"][0]:
# Fetch the full feed body with a GET request.
309 response = open_url("GET", url)
311 headers = response.getheaders()
312 feedhandle = response
314 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
317 return # don't need to do anything, nothing has changed.
# Same GET fetch as above, on another code path.
319 response = open_url("GET", url)
321 headers = response.getheaders()
322 feedhandle = response
324 sys.stderr.write("Failed to fetch feed: %s\n" %(url))
# Parse the response with feedparser and open the per-item state database.
327 fp = feedparser.parse(feedhandle)
328 db = dbm.open(os.path.join(statedir, "seen"), "c")
329 for item in fp["items"]:
330 # have we seen it before?
331 # need to work out what the content is first...
# The item's first "content" value is used when present; the summary is the
# other visible source.
333 if item.has_key("content"):
334 content = item["content"][0]["value"]
336 content = item["summary"]
# md5 of the UTF-8 encoded content, compared with the stored "contentmd5".
338 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
342 # check if there's a guid too - if that exists and we match the md5,
344 if item.has_key("guid"):
345 if db.has_key(url + "|" + item["guid"]):
346 data = db[url + "|" + item["guid"]]
347 data = cgi.parse_qs(data)
348 if data["contentmd5"][0] == md5sum:
# Lookup by link; a stored message-id is remembered as prevmessageid and is
# later used for the References header.
351 if db.has_key(url + "|" + item["link"]):
352 data = db[url + "|" + item["link"]]
353 data = cgi.parse_qs(data)
354 if data.has_key("message-id"):
355 prevmessageid = data["message-id"][0]
356 if data["contentmd5"][0] == md5sum:
360 author = item["author"]
364 # create a basic email message
365 msg = MIMEMultipart("alternative")
367 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
371 string.ascii_letters + string.digits \
372 ) for a in range(0,6) \
373 ]) + "@" + socket.gethostname() + ">"
# Headers: the feed URL is used for the unix-from line and To, the item
# author for From.
374 msg.add_header("Message-ID", messageid)
375 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
376 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
377 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
379 msg.add_header("References", prevmessageid)
# NOTE(review): %e and %T are platform-specific strftime directives, and
# datetime.now() is local time although the value is stamped "-0000" --
# confirm this is intended.
380 createddate = datetime.datetime.now() \
381 .strftime("%a, %e %b %Y %T -0000")
383 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
384 .strftime("%a, %e %b %Y %T -0000")
387 msg.add_header("Date", createddate)
388 msg.add_header("Subject", item["title"])
389 msg.set_default_type("text/plain")
# HTML part: the item content with an "Item URL" paragraph appended.
391 htmlcontent = content.encode("utf-8")
392 htmlcontent = "%s\n\n<p>Item URL: <a href='%s'>%s</a></p>" %( \
396 htmlpart = MIMEText(htmlcontent.encode("utf-8"), "html", "utf-8")
# Plain-text part: the same content run through HTML2Text, with the item URL
# appended.
397 textparser = HTML2Text()
398 textparser.feed(content.encode("utf-8"))
399 textcontent = textparser.gettext()
400 textcontent = "%s\n\nItem URL: %s" %( \
403 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
407 # start by working out the filename we should be writing to, we do
408 # this following the normal maildir style rules
# NOTE(review): strftime('%s') (epoch seconds) is a platform-specific
# extension -- confirm it is available on the target systems.
409 fname = str(os.getpid()) \
410 + "." + socket.gethostname() \
413 string.ascii_letters + string.digits \
414 ) for a in range(0,10) \
416 + datetime.datetime.now().strftime('%s')
# The message is written under tmp/ first.
417 fn = os.path.join(maildir, "tmp", fname)
419 fh.write(msg.as_string())
421 # now move it in to the new directory
422 newfn = os.path.join(maildir, "new", fname)
426 # now add to the database about the item
# The previous message id is chained in front of the new one before storing.
428 messageid = prevmessageid + " " + messageid
# Items whose guid differs from their link are stored under the guid key.
429 if item.has_key("guid") and item["guid"] != item["link"]:
430 data = urllib.urlencode(( \
431 ("message-id", messageid), \
432 ("created", createddate), \
433 ("contentmd5", md5sum) \
435 db[url + "|" + item["guid"]] = data
437 data = db[url + "|" + item["link"]]
438 data = cgi.parse_qs(data)
439 newdata = urllib.urlencode(( \
440 ("message-id", messageid), \
441 ("created", data["created"][0]), \
442 ("contentmd5", data["contentmd5"][0]) \
444 db[url + "|" + item["link"]] = newdata
446 db[url + "|" + item["link"]] = data
448 data = urllib.urlencode(( \
449 ("message-id", messageid), \
450 ("created", createddate), \
451 ("contentmd5", md5sum) \
453 db[url + "|" + item["link"]] = data
# Collect the cache-related response headers and URL-encode them; presumably
# the result is stored in the "feeds" database (that line is not visible).
457 for header in headers:
458 if header[0] in ["content-md5", "etag", "last-modified", "content-length"]:
459 data.append((header[0], header[1]))
461 data = urllib.urlencode(data)
# Script body: parse the command-line options, locate the config file, prepare
# the state directory and the Maildir root, then process every feed section.
# NOTE(review): the enclosing `def` or `if __name__` line and many statements
# (try/except/else lines, sys.exit calls) are not visible in this listing, so
# the comments below describe only the statements that are shown.
467 # first off, parse the command line arguments
469 oparser = OptionParser()
471 "-c", "--conf", dest="conf",
472 help="location of config file"
475 "-s", "--statedir", dest="statedir",
476 help="location of directory to store state in"
479 (options, args) = oparser.parse_args()
481 # check for the configfile
485 if options.conf != None:
486 # does the file exist?
488 os.stat(options.conf)
489 configfile = options.conf
491 # should exit here as the specified file doesn't exist
493 "Config file %s does not exist. Exiting.\n" %(options.conf,))
496 # check through the default locations
# Defaults tried in the visible code: $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.
498 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
499 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
502 os.stat("/etc/rss2maildir.conf")
503 configfile = "/etc/rss2maildir.conf"
505 sys.stderr.write("No config file found. Exiting.\n")
508 # Right - if we've got this far, we've got a config file, now for the hard
511 scp = SafeConfigParser()
# Default Maildir root; replaced below when [general] maildir_root is set.
514 maildir_root = "RSSMaildir"
# State directory: the -s/--statedir option is checked before the
# [general] state_dir setting.
517 if options.statedir != None:
518 state_dir = options.statedir
520 mode = os.stat(state_dir)[stat.ST_MODE]
521 if not stat.S_ISDIR(mode):
523 "State directory (%s) is not a directory\n" %(state_dir))
526 # try to make the directory
530 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
532 elif scp.has_option("general", "state_dir"):
533 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats `state_dir` although the configured value was just
# read into `new_state_dir`; it looks like the wrong variable -- confirm.
535 mode = os.stat(state_dir)[stat.ST_MODE]
536 if not stat.S_ISDIR(mode):
538 "State directory (%s) is not a directory\n" %(state_dir))
543 os.mkdir(new_state_dir)
544 state_dir = new_state_dir
547 "Couldn't create state directory %s\n" %(new_state_dir))
551 mode = os.stat(state_dir)[stat.ST_MODE]
552 if not stat.S_ISDIR(mode):
554 "State directory %s is not a directory\n" %(state_dir))
561 "State directory %s could not be created\n" %(state_dir))
# Maildir root: take the configured value if present, then check that it is a
# directory or create it.
564 if scp.has_option("general", "maildir_root"):
565 maildir_root = scp.get("general", "maildir_root")
568 mode = os.stat(maildir_root)[stat.ST_MODE]
569 if not stat.S_ISDIR(mode):
571 "Maildir Root %s is not a directory\n" \
576 os.mkdir(maildir_root)
578 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section except "general" is treated as a feed.
581 feeds = scp.sections()
583 feeds.remove("general")
587 for section in feeds:
588 # check if the directory exists
591 maildir = scp.get(section, "maildir")
# The maildir name is URL-encoded (the value half of an encoded key=value
# pair is kept) and then joined under the Maildir root.
595 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
596 maildir = os.path.join(maildir_root, maildir)
599 exists = os.stat(maildir)
600 if stat.S_ISDIR(exists[stat.ST_MODE]):
601 # check if there's a new, cur and tmp directory
603 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
605 os.mkdir(os.path.join(maildir, "cur"))
606 if not stat.S_ISDIR(mode):
607 sys.stderr.write("Broken maildir: %s\n" %(maildir))
609 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
611 os.mkdir(os.path.join(maildir, "tmp"))
612 if not stat.S_ISDIR(mode):
613 sys.stderr.write("Broken maildir: %s\n" %(maildir))
615 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
616 if not stat.S_ISDIR(mode):
617 sys.stderr.write("Broken maildir: %s\n" %(maildir))
619 os.mkdir(os.path.join(maildir, "new"))
621 sys.stderr.write("Broken maildir: %s\n" %(maildir))
626 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# Create the three standard Maildir subdirectories.
629 os.mkdir(os.path.join(maildir, "new"))
630 os.mkdir(os.path.join(maildir, "cur"))
631 os.mkdir(os.path.join(maildir, "tmp"))
634 "Couldn't create required maildir directories for %s\n" \
638 # right - we've got the directories, we've got the section, we know the
# The section name itself is passed as the feed URL.
641 parse_and_deliver(maildir, section, state_dir)