4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
27 from email.MIMEMultipart import MIMEMultipart
28 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass that builds a plain-text rendering of the
# HTML it is fed.  Output accumulates in self.text; paragraph text is buffered
# in self.currentparagraph and wrapped at 70 columns, blockquote text is
# buffered in self.blockquote and emitted as "> "-prefixed lines wrapped at 68
# columns, and headings are underlined with "=", "-" or "~".
#
# NOTE(review): the embedded original line numbers skip in several places, so
# some lines of this class are not visible in this extract (presumably
# including the `def __init__` header, several `if`/`else` lines and the
# header of the last method).  The comments below describe visible lines only.
59 class HTML2Text(HTMLParser):
# Parser state flags.  inparagraph starts True, so text seen before any tag
# is collected as paragraph text by handle_data.
62 self.inheadingone = False
63 self.inheadingtwo = False
64 self.inotherheading = False
65 self.inparagraph = True
66 self.inblockquote = False
# Unicode buffers for the paragraph and heading currently being collected.
69 self.currentparagraph = u''
70 self.headingtext = u''
# Base-class initialisation is done explicitly after the state is set up.
73 HTMLParser.__init__(self)
# handle_starttag: update the state flags for an opening tag and flush any
# buffered text that the tag terminates.  `attrs` is not used on the visible
# lines.
75 def handle_starttag(self, tag, attrs):
# Headings (h1, h2, h3-h6) switch to the matching heading mode and leave
# paragraph mode.
76 if tag.lower() == "h1":
77 self.inheadingone = True
78 self.inparagraph = False
79 elif tag.lower() == "h2":
80 self.inheadingtwo = True
81 self.inparagraph = False
82 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
83 self.inotherheading = True
84 self.inparagraph = False
# The body of the "a" branch is not visible in this extract.
85 elif tag.lower() == "a":
# <br>: flush the buffered paragraph (wrapped at 70) or the buffered
# blockquote (wrapped at 68, each line prefixed with "> "), otherwise just
# emit a newline.  The condition opening this chain is not visible; it is
# presumably `if self.inparagraph:` -- TODO confirm.
87 elif tag.lower() == "br":
89 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode('utf-8') + "\n"
90 self.currentparagraph = ""
91 elif self.inblockquote:
92 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
95 self.text = self.text + "\n"
# <blockquote>: enter blockquote mode and start a new output line.
96 elif tag.lower() == "blockquote":
97 self.inblockquote = True
98 self.text = self.text + "\n"
# <p>: emit a blank line and the wrapped pending paragraph (the guards on
# these two statements are not visible), then start a fresh paragraph.
99 elif tag.lower() == "p":
101 self.text = self.text + "\n\n"
103 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8")
104 self.currentparagraph = u''
105 self.inparagraph = True
# <pre>: start a new output line and leave paragraph and blockquote modes.
106 elif tag.lower() == "pre":
107 self.text = self.text + "\n"
109 self.inparagraph = False
110 self.inblockquote = False
# handle_startendtag: self-closing tags; only <br/> is handled on the visible
# lines, with the same flush statements as the "br" branch of handle_starttag
# (except that the paragraph buffer is reset to u'' rather than "").
112 def handle_startendtag(self, tag, attrs):
113 if tag.lower() == "br":
115 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8") + "\n"
116 self.currentparagraph = u''
117 elif self.inblockquote:
118 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
121 self.text = self.text + "\n"
# handle_endtag: close the current heading/paragraph/blockquote and emit its
# buffered text.
123 def handle_endtag(self, tag):
# Headings are written as a blank line, the heading text, then an underline
# whose length is that of the stripped heading text: "=" for h1, "-" for h2
# and "~" for h3-h6.
124 if tag.lower() == "h1":
125 self.inheadingone = False
126 self.text = self.text + "\n\n" + self.headingtext + "\n" + "=" * len(self.headingtext.strip())
127 self.headingtext = u''
128 elif tag.lower() == "h2":
129 self.inheadingtwo = False
130 self.text = self.text + "\n\n" + self.headingtext + "\n" + "-" * len(self.headingtext.strip())
131 self.headingtext = u''
132 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
133 self.inotherheading = False
134 self.text = self.text + "\n\n" + self.headingtext + "\n" + "~" * len(self.headingtext.strip())
135 self.headingtext = u''
# </p>: append the wrapped paragraph and leave paragraph mode.
# NOTE(review): unlike the other flush sites this one does not call
# .encode("utf-8"), and the visible lines do not reset currentparagraph --
# confirm both are intended.
136 elif tag.lower() == "p":
137 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70))
138 self.inparagraph = False
# </blockquote>: emit the quoted lines, leave blockquote mode, reset buffer.
139 elif tag.lower() == "blockquote":
140 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
141 self.inblockquote = False
142 self.blockquote = u''
# The body of the "pre" branch is not visible in this extract.
143 elif tag.lower() == "pre":
# handle_data: route character data to the buffer for the current mode.
# Data is decoded with unicode(data, "utf-8"), stripped, and followed by a
# single space so that consecutive chunks stay separated.
146 def handle_data(self, data):
147 if self.inheadingone or self.inheadingtwo or self.inotherheading:
148 self.headingtext = self.headingtext + unicode(data, "utf-8").strip() + u' '
149 elif self.inblockquote:
150 self.blockquote = self.blockquote + unicode(data, "utf-8").strip() + u' '
151 elif self.inparagraph:
152 self.currentparagraph = self.currentparagraph + unicode(data, "utf-8").strip() + u' '
# The conditions guarding the next two statements are not visible here.
# NOTE(review): the first appends data.encode("utf-8") and the second a
# unicode string to the same self.text -- confirm the mix is intended.
154 self.text = self.text + data.encode("utf-8")
156 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# handle_entityref: turn an entity reference into text.  The lower-cased
# name is looked up in the `entities` mapping (defined outside this extract);
# the other two visible assignments use unichr(int(name[1:])) or keep the
# literal "&name;" text (their guarding lines are not visible).  The result is
# appended to the paragraph buffer, the blockquote buffer or self.text.
158 def handle_entityref(self, name):
160 if entities.has_key(name.lower()):
161 entity = entities[name.lower()]
163 entity = unichr(int(name[1:]))
165 entity = "&" + name + ";"
168 self.currentparagraph = self.currentparagraph + entity
169 elif self.inblockquote:
170 self.blockquote = self.blockquote + entity
172 self.text = self.text + entity
# The header of the method containing the next line is not visible; it is
# presumably gettext(), which parse_and_deliver calls -- TODO confirm.  The
# line appends the still-buffered paragraph, wrapped at 70 columns, to the
# local `data`.
177 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
# parse_and_deliver(maildir, url, statedir): parse the feed at `url` with
# feedparser and, for each item, build a multipart/alternative email with an
# HTML part and a plain-text part produced by HTML2Text.  File names are
# computed under maildir/tmp and maildir/new, and each item is recorded in the
# "seen" dbm database kept in `statedir`.
#
# NOTE(review): the embedded original line numbers skip, so some lines
# (e.g. `else:`/`try:` headers, the file open/close and the move into "new",
# attaching the MIME parts) are not visible; comments describe visible lines.
# NOTE(review): has_key, the md5 module, urllib.urlencode and cgi.parse_qs
# are Python 2 era APIs.
180 def parse_and_deliver(maildir, url, statedir):
181 fp = feedparser.parse(url)
# "c" opens the database read/write, creating it if it does not exist.
182 db = dbm.open(os.path.join(statedir, "seen"), "c")
183 for item in fp["items"]:
184 # have we seen it before?
185 # need to work out what the content is first...
# Use the first "content" entry when the item has one; the "summary"
# assignment is the alternative (its `else:` line is not visible).
187 if item.has_key("content"):
188 content = item["content"][0]["value"]
190 content = item["summary"]
# The MD5 of the UTF-8 encoded content is what is compared with the stored
# record for this item.
192 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# Records are keyed by "<feed url>|<item link>" and stored as a URL-encoded
# query string (see the urlencode call at the end of the loop).  What happens
# when the stored contentmd5 matches is not visible here (presumably the item
# is skipped -- TODO confirm).
194 if db.has_key(url + "|" + item["link"]):
195 data = db[url + "|" + item["link"]]
196 data = cgi.parse_qs(data)
197 if data["contentmd5"][0] == md5sum:
# NOTE(review): the handling around a missing "author" key, if any, is not
# visible in this extract.
201 author = item["author"]
205 # create a basic email message
206 msg = MIMEMultipart("alternative")
# Message-ID: "<" minute-resolution timestamp "." 6 random letters/digits "@"
# hostname ">".
207 messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
208 msg.add_header("Message-ID", messageid)
# The feed URL is used as the display name of the envelope sender and of the
# To header; the item author is the display name of the From header.
209 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
210 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
211 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# The Date header is built from the first six fields of updated_parsed and a
# fixed "-0000" offset.
# NOTE(review): %e and %T are not among the strftime directives Python
# documents as portable -- confirm on the target platform.
212 createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
213 msg.add_header("Date", createddate)
214 msg.add_header("Subject", item["title"])
215 msg.set_default_type("text/plain")
# Build the HTML part from the raw content and the text part from the
# HTML2Text rendering of the same content.
217 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
218 textparser = HTML2Text()
219 textparser.feed(content.encode("utf-8"))
220 textcontent = textparser.gettext()
221 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
225 # start by working out the filename we should be writting to, we do
226 # this following the normal maildir style rules
# File name: pid "." hostname "." 10 random letters/digits "." strftime('%s').
# NOTE(review): '%s' is a platform-specific strftime directive (presumably
# epoch seconds here) -- confirm.
227 fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
228 fn = os.path.join(maildir, "tmp", fname)
# `fh` is created on a line not visible here (presumably by opening `fn`).
230 fh.write(msg.as_string())
232 # now move it in to the new directory
233 newfn = os.path.join(maildir, "new", fname)
237 # now add to the database about the item
# Store message-id, creation date and content hash under the same
# "<feed url>|<item link>" key that is looked up at the top of the loop.
238 data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
239 db[url + "|" + item["link"]] = data
# Script body: parse the command line, locate and read a config file, make
# sure the state directory and the maildir root exist, then deliver every
# feed section of the config into its own maildir via parse_and_deliver().
#
# NOTE(review): the embedded original line numbers skip throughout, so many
# `try:`/`except:`/`else:` headers, sys.exit calls and some assignments are
# not visible; the comments below describe only the visible lines.
243 # first off, parse the command line arguments
245 oparser = OptionParser()
# Two options are declared: -c/--conf (config file path) and -s/--statedir
# (state directory).  The calls that register them are not visible.
247 "-c", "--conf", dest="conf",
248 help="location of config file"
251 "-s", "--statedir", dest="statedir",
252 help="location of directory to store state in"
255 (options, args) = oparser.parse_args()
257 # check for the configfile
# An explicitly given config file is checked with os.stat(); the error
# message below is written for the case where it does not exist.
261 if options.conf != None:
262 # does the file exist?
264 os.stat(options.conf)
265 configfile = options.conf
267 # should exit here as the specified file doesn't exist
268 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
271 # check through the default locations
# Default locations tried: $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
273 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
274 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
277 os.stat("/etc/rss2maildir.conf")
278 configfile = "/etc/rss2maildir.conf"
280 sys.stderr.write("No config file found. Exiting.\n")
283 # Right - if we've got this far, we've got a config file, now for the hard
# The call that reads `configfile` into the parser is not visible here.
286 scp = SafeConfigParser()
# Default maildir root, used unless the config overrides it further down.
289 maildir_root = "RSSMaildir"
# State directory selection: the command-line value takes precedence over the
# "state_dir" option of the [general] config section.
292 if options.statedir != None:
293 state_dir = options.statedir
295 mode = os.stat(state_dir)[stat.ST_MODE]
296 if not stat.S_ISDIR(mode):
297 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
300 # try to make the directory
304 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
306 elif scp.has_option("general", "state_dir"):
307 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the next lines stat and report `state_dir`, not the
# `new_state_dir` just read from the config; on the visible lines state_dir is
# only switched to new_state_dir after a successful mkdir.  This looks like a
# bug (the configured directory is never checked, nor adopted when it already
# exists) -- confirm against the full file.
309 mode = os.stat(state_dir)[stat.ST_MODE]
310 if not stat.S_ISDIR(mode):
311 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
316 os.mkdir(new_state_dir)
317 state_dir = new_state_dir
319 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
# Remaining case (its branch header is not visible): check that the current
# state_dir is a directory; the second message covers a failed creation.
323 mode = os.stat(state_dir)[stat.ST_MODE]
324 if not stat.S_ISDIR(mode):
325 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
331 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
# Maildir root: take the [general] override if present, then make sure the
# directory exists (os.mkdir is the visible creation attempt).
334 if scp.has_option("general", "maildir_root"):
335 maildir_root = scp.get("general", "maildir_root")
338 mode = os.stat(maildir_root)[stat.ST_MODE]
339 if not stat.S_ISDIR(mode):
340 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
344 os.mkdir(maildir_root)
346 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section other than "general" is treated as a feed.
349 feeds = scp.sections()
351 feeds.remove("general")
355 for section in feeds:
356 # check if the directory exists
# The maildir name comes from the section's "maildir" option (any fallback is
# not visible).  It is URL-encoded by building a one-pair query string and
# keeping the value half, then placed under maildir_root.
359 maildir = scp.get(section, "maildir")
363 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
364 maildir = os.path.join(maildir_root, maildir)
# Existing maildir: check each of cur/tmp/new, creating it with os.mkdir or
# reporting "Broken maildir" when the path is not a directory.
367 exists = os.stat(maildir)
368 if stat.S_ISDIR(exists[stat.ST_MODE]):
369 # check if there's a new, cur and tmp directory
371 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
373 os.mkdir(os.path.join(maildir, "cur"))
374 if not stat.S_ISDIR(mode):
375 sys.stderr.write("Broken maildir: %s\n" %(maildir))
377 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
379 os.mkdir(os.path.join(maildir, "tmp"))
380 if not stat.S_ISDIR(mode):
381 sys.stderr.write("Broken maildir: %s\n" %(maildir))
383 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
384 if not stat.S_ISDIR(mode):
385 sys.stderr.write("Broken maildir: %s\n" %(maildir))
387 os.mkdir(os.path.join(maildir, "new"))
389 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# Missing maildir (branch header not visible): after the root maildir is
# created, create its new/cur/tmp subdirectories.
394 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
397 os.mkdir(os.path.join(maildir, "new"))
398 os.mkdir(os.path.join(maildir, "cur"))
399 os.mkdir(os.path.join(maildir, "tmp"))
401 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
404 # right - we've got the directories, we've got the section, we know the
# The section name is passed as parse_and_deliver's `url` argument, i.e. the
# config section name is the feed URL.
407 parse_and_deliver(maildir, section, state_dir)