4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTML-to-text converter built on HTMLParser: callers feed() it markup and
# the handler methods accumulate a plain-text rendering in self.text.
# NOTE(review): this view carries embedded line numbers with gaps and no
# indentation; the numbering jumps 59 -> 62, so the constructor's `def` line
# is not visible here and the assignments below are presumably its body.
59 class HTML2Text(HTMLParser):
# Flags recording which kind of element the parser is currently inside.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
# Paragraph mode is the only flag that starts switched on.
65 self.inparagraph = True
66 self.inblockquote = False
# Buffers for the paragraph and heading text collected so far.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Base-class initialisation runs after the state above has been set up.
76 HTMLParser.__init__(self)
# Start-tag callback: switches the state flags for the element being opened
# and, for some tags, flushes buffered text into self.text.
#   tag   -- element name, compared case-insensitively through tag.lower()
#   attrs -- attribute list; not referenced in any line visible here
# NOTE(review): the embedded numbering skips lines inside this method
# (e.g. 88 -> 90 -> 92), so some branch headers and statements are missing
# from this view.
78 def handle_starttag(self, tag, attrs):
# Heading tags: raise the matching heading flag and leave paragraph mode.
79 if tag.lower() == "h1":
80 self.inheadingone = True
81 self.inparagraph = False
82 elif tag.lower() == "h2":
83 self.inheadingtwo = True
84 self.inparagraph = False
85 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
86 self.inotherheading = True
87 self.inparagraph = False
# The body of the "a" branch (embedded line 89) is not visible here.
88 elif tag.lower() == "a":
# "br": the visible lines emit the buffered paragraph wrapped at 70 columns,
# or the buffered blockquote wrapped at 68 columns, and clear that buffer.
90 elif tag.lower() == "br":
92 self.text = self.text \
94 textwrap.wrap(self.currentparagraph, 70)) \
96 self.currentparagraph = ""
97 elif self.inblockquote:
98 self.text = self.text \
102 for a in textwrap.wrap(self.blockquote, 68) \
105 self.blockquote = u''
107 self.text = self.text + u'\n'
# "blockquote": enter blockquote mode and start a new line.
108 elif tag.lower() == "blockquote":
109 self.inblockquote = True
110 self.text = self.text + u'\n'
# "p": the visible lines add a blank line, write out the buffered paragraph
# wrapped at 70 columns, reset the buffer and enter paragraph mode.  The
# conditions on embedded lines 112 and 114 are not visible here.
111 elif tag.lower() == "p":
113 self.text = self.text + u'\n\n'
115 self.text = self.text \
116 + u'\n'.join(textwrap.wrap(self.currentparagraph, 70))
117 self.currentparagraph = u''
118 self.inparagraph = True
# "pre": start a new line and leave paragraph and blockquote modes
# (embedded line 121 is not visible here).
119 elif tag.lower() == "pre":
120 self.text = self.text + "\n"
122 self.inparagraph = False
123 self.inblockquote = False
# "ul": embedded lines 125-126 are not visible; a newline is appended.
124 elif tag.lower() == "ul":
127 self.text = self.text + "\n"
# "li" is only handled while self.inul is true; the visible lines append
# self.item, wrapped at 67 columns, to self.text.
128 elif tag.lower() == "li" and self.inul:
133 self.text = self.text \
135 + u'\n '.join([a.strip() for a in textwrap.wrap(self.item, 67)]) \
# Callback HTMLParser invokes for XHTML-style empty tags such as <br/>;
# only "br" is handled in the visible lines.  It flushes the buffered
# paragraph (wrapped at 70 columns) or the buffered blockquote into
# self.text and clears that buffer.
#   tag   -- element name, compared case-insensitively through tag.lower()
#   attrs -- attribute list; not referenced in any line visible here
# NOTE(review): the embedded numbering has gaps here (e.g. 140 -> 142,
# 146 -> 150), so parts of these expressions are missing from this view.
139 def handle_startendtag(self, tag, attrs):
140 if tag.lower() == "br":
142 self.text = self.text \
145 for a in textwrap.wrap( \
146 self.currentparagraph, 70) \
150 self.currentparagraph = u''
151 elif self.inblockquote:
# NOTE(review): this path wraps self.blockquote.encode("utf-8") (a byte
# string), whereas the "br" branch of handle_starttag wraps the unicode
# buffer directly; presumably mixing the two fails on non-ASCII text under
# Python 2 -- confirm.
152 self.text = self.text \
156 for a in textwrap.wrap( \
157 self.blockquote.encode("utf-8") \
162 self.blockquote = u''
164 self.text = self.text + "\n"
# End-tag callback: flushes the buffer belonging to the element being closed
# into self.text and resets the matching state.
#   tag -- element name, compared case-insensitively through tag.lower()
# NOTE(review): the embedded numbering has gaps, so parts of each expression
# and the whole bodies of the "pre" and "ul" branches are missing from this
# view.
166 def handle_endtag(self, tag):
# Headings: emit the heading text plus a run of '=' (h1), '-' (h2) or
# '~' (h3-h6) characters, then clear the heading buffer.
# NOTE(review): the run length is len() of the UTF-8 *encoded* heading, so
# it counts bytes rather than characters, and the encoded byte string is
# concatenated with unicode literals; both look wrong for non-ASCII
# headings under Python 2 -- confirm.
167 if tag.lower() == "h1":
168 self.inheadingone = False
169 self.text = self.text \
171 + self.headingtext.encode("utf-8") \
173 + u'=' * len(self.headingtext.encode("utf-8").strip())
174 self.headingtext = u''
175 elif tag.lower() == "h2":
176 self.inheadingtwo = False
177 self.text = self.text \
179 + self.headingtext.encode("utf-8") \
181 + u'-' * len(self.headingtext.encode("utf-8").strip())
182 self.headingtext = u''
183 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
184 self.inotherheading = False
185 self.text = self.text \
187 + self.headingtext.encode("utf-8") \
189 + u'~' * len(self.headingtext.encode("utf-8").strip())
190 self.headingtext = u''
# "p": write the buffered paragraph wrapped at 70 columns, then leave
# paragraph mode and clear the buffer.
191 elif tag.lower() == "p":
192 self.text = self.text \
193 + u'\n'.join(textwrap.wrap( \
194 self.currentparagraph, 70) \
196 self.inparagraph = False
197 self.currentparagraph = u''
# "blockquote": write the buffered quote wrapped at 68 columns, then leave
# blockquote mode and clear the buffer.
198 elif tag.lower() == "blockquote":
199 self.text = self.text \
203 for a in textwrap.wrap( \
204 self.blockquote, 68)] \
207 self.inblockquote = False
208 self.blockquote = u''
# "pre": body (embedded line 210) is not visible here.
209 elif tag.lower() == "pre":
# "li": the visible lines append self.item, wrapped at 67 columns.
211 elif tag.lower() == "li":
214 self.text = self.text \
217 [a.strip() for a in textwrap.wrap(self.item, 67)]) \
220 elif tag.lower() == "ul":
# Text callback: appends character data to the buffer selected by the
# current state -- heading, blockquote, paragraph or list item -- and
# otherwise to self.text.
#   data -- decoded with unicode(data, "utf-8"), so presumably a UTF-8
#           byte string
# Heading, blockquote and paragraph text is stripped of surrounding
# whitespace before being appended; list-item text is appended as-is.
# NOTE(review): the embedded numbering has gaps (227, 231, 235, 238, 240),
# so the tails of the continued expressions and the conditions guarding the
# last two appends to self.text are not visible here.
223 def handle_data(self, data):
224 if self.inheadingone or self.inheadingtwo or self.inotherheading:
225 self.headingtext = self.headingtext \
226 + unicode(data, "utf-8").strip() \
228 elif self.inblockquote:
229 self.blockquote = self.blockquote \
230 + unicode(data, "utf-8").strip() \
232 elif self.inparagraph:
233 self.currentparagraph = self.currentparagraph \
234 + unicode(data, "utf-8").strip() \
236 elif self.inul and self.initem:
237 self.item = self.item + unicode(data, "utf-8")
# One of the two appends below keeps the data verbatim; the other strips it
# and adds a trailing space.
239 self.text = self.text + unicode(data, "utf-8")
241 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# Entity-reference callback: resolves the entity and appends the result to
# the paragraph buffer, the blockquote buffer, or self.text.
#   name -- entity name; looked up lowercased in `entities`, which is
#           defined outside this view
# NOTE(review): embedded lines 244, 247, 249, 251-252 and 256 are missing,
# so not every branch condition is visible here.
243 def handle_entityref(self, name):
# NOTE(review): has_key() is the legacy spelling of the `in` test.
245 if entities.has_key(name.lower()):
246 entity = entities[name.lower()]
# Numeric form: the text after the first character is parsed as a decimal
# code point.
# NOTE(review): unichr() returns a unicode object, and the appends below
# call unicode(entity, "utf-8"), which Python 2 rejects for unicode input
# with a TypeError -- confirm whether this path can reach them.
248 entity = unichr(int(name[1:]))
# The reference is kept literally as "&name;" (presumably the fallback
# branch -- its condition is not visible).
250 entity = "&" + name + ";"
253 self.currentparagraph = self.currentparagraph + unicode(entity, "utf-8")
254 elif self.inblockquote:
255 self.blockquote = self.blockquote + unicode(entity, "utf-8")
257 self.text = self.text + unicode(entity, "utf-8")
# NOTE(review): fragment of a method whose `def` line is not visible here
# (presumably gettext(), which parse_and_deliver calls).  It appends the
# still-buffered paragraph, wrapped at 70 columns, to the text in `data`.
262 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# Fetch one feed and deliver its items as messages into a Maildir.
#   maildir  -- Maildir directory; file names are built under its "tmp"
#               and "new" subdirectories
#   url      -- feed URL handed to feedparser.parse(); also used in the
#               state-database key and in the To header
#   statedir -- directory holding the "seen" dbm database
# NOTE(review): the embedded numbering has gaps throughout, so several
# statements (branch headers, the file open and rename calls, attaching the
# MIME parts) are missing from this view.
265 def parse_and_deliver(maildir, url, statedir):
266 fp = feedparser.parse(url)
# Flag "c" opens the database read-write, creating it if it does not exist.
267 db = dbm.open(os.path.join(statedir, "seen"), "c")
268 for item in fp["items"]:
269 # have we seen it before?
270 # need to work out what the content is first...
# The first "content" entry is used when present; item["summary"] is the
# alternative.
272 if item.has_key("content"):
273 content = item["content"][0]["value"]
275 content = item["summary"]
# Digest of the UTF-8 encoded content, compared against the stored record.
277 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# State records are keyed by "<url>|<item link>" and hold a URL-encoded
# query string (written by the urlencode call further down).
279 if db.has_key(url + "|" + item["link"]):
280 data = db[url + "|" + item["link"]]
281 data = cgi.parse_qs(data)
# NOTE(review): what happens when the digests match is on an embedded line
# not shown here (283); presumably the item is skipped -- confirm.
282 if data["contentmd5"][0] == md5sum:
286 author = item["author"]
290 # create a basic email message
291 msg = MIMEMultipart("alternative")
# Message-ID pieces visible here: a minute-resolution timestamp, random
# letters/digits and the local host name.
293 + datetime.datetime.now().strftime("%Y%m%d%H%M") \
297 string.ascii_letters + string.digits \
298 ) for a in range(0,6) \
299 ]) + "@" + socket.gethostname() + ">"
300 msg.add_header("Message-ID", messageid)
# The feed URL is the display name of the envelope sender and of the To
# header; the item author is the display name of the From header.
301 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
302 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
303 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Date header: first six fields of updated_parsed with a fixed "-0000" zone.
# NOTE(review): %e and %T are not among the strftime directives Python
# documents as portable -- confirm on the target platforms.
304 createddate = datetime.datetime(*item["updated_parsed"][0:6]) \
305 .strftime("%a, %e %b %Y %T -0000")
306 msg.add_header("Date", createddate)
307 msg.add_header("Subject", item["title"])
308 msg.set_default_type("text/plain")
# Build both renderings: the original HTML and a plain-text conversion
# produced by HTML2Text.
310 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
311 textparser = HTML2Text()
312 textparser.feed(content.encode("utf-8"))
313 textcontent = textparser.gettext()
314 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
318 # start by working out the filename we should be writing to, we do
319 # this following the normal maildir style rules
# File name pieces visible here: process id, host name, random
# letters/digits and a timestamp.
# NOTE(review): strftime('%s') is a platform extension, not a documented
# Python directive -- confirm.
320 fname = str(os.getpid()) \
321 + "." + socket.gethostname() \
324 string.ascii_letters + string.digits \
325 ) for a in range(0,10) \
327 + datetime.datetime.now().strftime('%s')
328 fn = os.path.join(maildir, "tmp", fname)
330 fh.write(msg.as_string())
332 # now move it in to the new directory
333 newfn = os.path.join(maildir, "new", fname)
337 # now add to the database about the item
338 data = urllib.urlencode((
339 ("message-id", messageid), \
340 ("created", createddate), \
341 ("contentmd5", md5sum) \
343 db[url + "|" + item["link"]] = data
# Script body: parse the command line, locate the config file, resolve the
# state directory and Maildir root, then process every feed section.
# NOTE(review): the embedded numbering has gaps throughout (e.g. 347 -> 349
# -> 351), so try/except/else headers, the add_option calls and any exit
# calls are not visible in this view.
347 # first off, parse the command line arguments
349 oparser = OptionParser()
351 "-c", "--conf", dest="conf",
352 help="location of config file"
355 "-s", "--statedir", dest="statedir",
356 help="location of directory to store state in"
359 (options, args) = oparser.parse_args()
361 # check for the configfile
# NOTE(review): "is not None" is the idiomatic form of this test.
365 if options.conf != None:
366 # does the file exist?
368 os.stat(options.conf)
369 configfile = options.conf
371 # should exit here as the specified file doesn't exist
372 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
375 # check through the default locations
377 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
378 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
381 os.stat("/etc/rss2maildir.conf")
382 configfile = "/etc/rss2maildir.conf"
384 sys.stderr.write("No config file found. Exiting.\n")
387 # Right - if we've got this far, we've got a config file, now for the hard
390 scp = SafeConfigParser()
393 maildir_root = "RSSMaildir"
# State directory: the -s option is checked first, then the [general]
# state_dir config option; the remaining checks use state_dir as already set.
396 if options.statedir != None:
397 state_dir = options.statedir
399 mode = os.stat(state_dir)[stat.ST_MODE]
400 if not stat.S_ISDIR(mode):
401 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
404 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing "\n".
408 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
410 elif scp.has_option("general", "state_dir"):
411 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats (and reports) state_dir rather than the
# new_state_dir just read from the config -- looks unintended; confirm.
413 mode = os.stat(state_dir)[stat.ST_MODE]
414 if not stat.S_ISDIR(mode):
415 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
420 os.mkdir(new_state_dir)
421 state_dir = new_state_dir
423 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
427 mode = os.stat(state_dir)[stat.ST_MODE]
428 if not stat.S_ISDIR(mode):
429 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
435 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
# Maildir root: the [general] maildir_root option overrides the default
# assigned above.
438 if scp.has_option("general", "maildir_root"):
439 maildir_root = scp.get("general", "maildir_root")
442 mode = os.stat(maildir_root)[stat.ST_MODE]
443 if not stat.S_ISDIR(mode):
444 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
448 os.mkdir(maildir_root)
450 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# "general" is removed from the section list before the loop over feeds.
453 feeds = scp.sections()
455 feeds.remove("general")
459 for section in feeds:
460 # check if the directory exists
463 maildir = scp.get(section, "maildir")
# urlencode() is used to URL-quote the maildir name; the text after the
# first "=" of the encoded pair is kept and joined onto the Maildir root.
467 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
468 maildir = os.path.join(maildir_root, maildir)
471 exists = os.stat(maildir)
472 if stat.S_ISDIR(exists[stat.ST_MODE]):
473 # check if there's a new, cur and tmp directory
475 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
477 os.mkdir(os.path.join(maildir, "cur"))
478 if not stat.S_ISDIR(mode):
479 sys.stderr.write("Broken maildir: %s\n" %(maildir))
481 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
483 os.mkdir(os.path.join(maildir, "tmp"))
484 if not stat.S_ISDIR(mode):
485 sys.stderr.write("Broken maildir: %s\n" %(maildir))
487 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
488 if not stat.S_ISDIR(mode):
489 sys.stderr.write("Broken maildir: %s\n" %(maildir))
491 os.mkdir(os.path.join(maildir, "new"))
493 sys.stderr.write("Broken maildir: %s\n" %(maildir))
498 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
501 os.mkdir(os.path.join(maildir, "new"))
502 os.mkdir(os.path.join(maildir, "cur"))
503 os.mkdir(os.path.join(maildir, "tmp"))
505 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
508 # right - we've got the directories, we've got the section, we know the
# The section name is passed as parse_and_deliver's `url` argument.
511 parse_and_deliver(maildir, section, state_dir)