4 # rss2maildir.py - RSS feeds to Maildir, one email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
60 class HTML2Text(HTMLParser):
# HTMLParser subclass that renders HTML as plain text: paragraph text is
# buffered and wrapped at 70 columns with textwrap, blockquote text is wrapped
# at 68 columns and prefixed with "> ", and headings are underlined with "=",
# "-" or "~". Output accumulates in self.text.
# NOTE(review): this view of the file has no indentation and appears to be
# missing lines (for example, no method header is visible above the attribute
# initialisation below, and self.text / self.blockquote are used but never
# initialised on a visible line) - confirm against the complete file.

# Flags recording which kind of element the parser is currently inside.
63 self.inheadingone = False
64 self.inheadingtwo = False
65 self.inotherheading = False
# Starts True, so text seen before any tag goes to the paragraph buffer
# (see the inparagraph branch of handle_data).
66 self.inparagraph = True
67 self.inblockquote = False
# Unicode buffers for the paragraph and heading currently being collected.
70 self.currentparagraph = u''
71 self.headingtext = u''
74 HTMLParser.__init__(self)
# Called for each opening tag; switches the parser into the matching mode and
# flushes buffered text where an element boundary requires it. `attrs` is not
# read on any visible line.
76 def handle_starttag(self, tag, attrs):
# Headings: enter heading mode and leave paragraph mode, so handle_data
# collects the text into self.headingtext.
77 if tag.lower() == "h1":
78 self.inheadingone = True
79 self.inparagraph = False
80 elif tag.lower() == "h2":
81 self.inheadingtwo = True
82 self.inparagraph = False
83 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
84 self.inotherheading = True
85 self.inparagraph = False
# NOTE(review): the body of the "a" branch is not visible in this view.
86 elif tag.lower() == "a":
# <br>: flush the buffered paragraph (wrapped at 70 columns) or the buffered
# blockquote (wrapped at 68 columns, each line prefixed with "> ", i.e. 70
# columns in total); the last statement of the branch emits a bare newline.
# NOTE(review): the condition guarding the paragraph case is not visible here.
88 elif tag.lower() == "br":
90 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode('utf-8') + "\n"
# NOTE(review): reset to a byte string "" here, while every other reset of
# this buffer uses u''.
91 self.currentparagraph = ""
92 elif self.inblockquote:
93 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
96 self.text = self.text + "\n"
# <blockquote>: enter blockquote mode; handle_data then collects text into
# self.blockquote until the closing tag.
97 elif tag.lower() == "blockquote":
98 self.inblockquote = True
99 self.text = self.text + "\n"
# <p>: add a blank-line separator, flush any pending paragraph text, then
# start a fresh paragraph buffer.
# NOTE(review): the conditions guarding the two appends are not visible here.
100 elif tag.lower() == "p":
102 self.text = self.text + "\n\n"
104 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8")
105 self.currentparagraph = u''
106 self.inparagraph = True
# <pre>: start on a new line and leave paragraph and blockquote modes.
107 elif tag.lower() == "pre":
108 self.text = self.text + "\n"
110 self.inparagraph = False
111 self.inblockquote = False
# Called by HTMLParser for XHTML-style empty tags such as <br/>. The visible
# lines repeat the <br> handling of handle_starttag.
113 def handle_startendtag(self, tag, attrs):
114 if tag.lower() == "br":
116 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70)).encode("utf-8") + "\n"
117 self.currentparagraph = u''
118 elif self.inblockquote:
119 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
122 self.text = self.text + "\n"
# Called for each closing tag; emits the text buffered for the element that
# just ended and leaves the corresponding mode.
124 def handle_endtag(self, tag):
# Headings are written as the heading text followed by an underline. The
# underline length uses the stripped text because handle_data appends a
# trailing space to self.headingtext.
125 if tag.lower() == "h1":
126 self.inheadingone = False
127 self.text = self.text + "\n\n" + self.headingtext + "\n" + "=" * len(self.headingtext.strip())
128 self.headingtext = u''
129 elif tag.lower() == "h2":
130 self.inheadingtwo = False
131 self.text = self.text + "\n\n" + self.headingtext + "\n" + "-" * len(self.headingtext.strip())
132 self.headingtext = u''
133 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
134 self.inotherheading = False
135 self.text = self.text + "\n\n" + self.headingtext + "\n" + "~" * len(self.headingtext.strip())
136 self.headingtext = u''
# </p>: append the wrapped paragraph and leave paragraph mode.
# NOTE(review): unlike the other paragraph flushes in this class, the wrapped
# text is not passed through .encode("utf-8") here.
137 elif tag.lower() == "p":
138 self.text = self.text + "\n".join(textwrap.wrap(self.currentparagraph, 70))
139 self.inparagraph = False
# </blockquote>: emit the quoted text with "> " prefixes and reset the buffer.
140 elif tag.lower() == "blockquote":
141 self.text = self.text + "\n> " + "\n> ".join([a.strip() for a in textwrap.wrap(self.blockquote, 68)]).encode("utf-8") + "\n"
142 self.inblockquote = False
143 self.blockquote = u''
# NOTE(review): the body of the "pre" branch is not visible in this view.
144 elif tag.lower() == "pre":
# Called with the text between tags. The text is decoded as UTF-8, stripped,
# and appended (followed by a single space) to the buffer of the active mode:
# heading, blockquote or paragraph.
147 def handle_data(self, data):
148 if self.inheadingone or self.inheadingtwo or self.inotherheading:
149 self.headingtext = self.headingtext + unicode(data, "utf-8").strip() + u' '
150 elif self.inblockquote:
151 self.blockquote = self.blockquote + unicode(data, "utf-8").strip() + u' '
152 elif self.inparagraph:
153 self.currentparagraph = self.currentparagraph + unicode(data, "utf-8").strip() + u' '
# NOTE(review): the conditions selecting between the two appends below are
# not visible; the first adds the data without stripping, the second adds it
# stripped with a trailing space.
155 self.text = self.text + data.encode("utf-8")
157 self.text = self.text + unicode(data, "utf-8").strip() + u' '
# Called for entity references such as &amp;. The replacement is taken from
# the `entities` mapping (defined outside the visible lines), or built with
# unichr() from the digits after the first character of the name, or kept
# literally as "&name;". It is then appended to the active buffer.
# NOTE(review): the conditions choosing between these cases, and between the
# paragraph buffer and self.text, are only partly visible here.
159 def handle_entityref(self, name):
161 if entities.has_key(name.lower()):
162 entity = entities[name.lower()]
164 entity = unichr(int(name[1:]))
166 entity = "&" + name + ";"
169 self.currentparagraph = self.currentparagraph + entity
170 elif self.inblockquote:
171 self.blockquote = self.blockquote + entity
173 self.text = self.text + entity
# Appends the still-buffered paragraph, wrapped at 70 columns, to a local
# `data` value.
# NOTE(review): the header of the method this line belongs to is not visible;
# presumably it is the gettext() that parse_and_deliver calls - confirm.
178 data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
181 def parse_and_deliver(maildir, url, statedir):
# Fetch the feed at `url` and write one multipart/alternative message per
# feed item into the Maildir at `maildir`, recording each delivered item in a
# dbm database named "seen" inside `statedir`.
# NOTE(review): this view of the file has no indentation and appears to be
# missing lines (else/continue statements, the file open and the rename) -
# confirm against the complete file.
182 md = mailbox.Maildir(maildir)
183 fp = feedparser.parse(url)
# Flag "c" opens the database read/write and creates it if it does not exist.
184 db = dbm.open(os.path.join(statedir, "seen"), "c")
185 for item in fp["items"]:
186 # have we seen it before?
187 # need to work out what the content is first...
# Use the first "content" entry when the item has one; item["summary"] is the
# alternative (the else line itself is not visible here).
189 if item.has_key("content"):
190 content = item["content"][0]["value"]
192 content = item["summary"]
# Checksum of the UTF-8 encoded content, used to detect changed items.
194 md5sum = md5.md5(content.encode("utf-8")).hexdigest()
# State records are keyed by "<feed url>|<item link>" and stored as a
# URL-encoded query string; parse_qs returns a list per key, hence the [0].
196 if db.has_key(url + "|" + item["link"]):
197 data = db[url + "|" + item["link"]]
198 data = cgi.parse_qs(data)
# NOTE(review): the statement run when the checksum matches is not visible;
# presumably the unchanged item is skipped - confirm.
199 if data["contentmd5"][0] == md5sum:
# NOTE(review): any fallback for items without an "author" key is not visible.
203 author = item["author"]
207 # create a basic email message
208 msg = MIMEMultipart("alternative")
# Message-ID: timestamp to the minute, six random alphanumerics, and the
# local hostname.
209 messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
210 msg.add_header("Message-ID", messageid)
# The feed URL is used as the display name of the envelope sender and of the
# recipient; the item author is used as the display name in From.
211 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
212 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
213 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Date is built from the first six fields of updated_parsed with a literal
# "-0000" offset.
# NOTE(review): %e and %T are not in Python's documented strftime directive
# list, so this presumably relies on the platform C library - confirm.
214 createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
215 msg.add_header("Date", createddate)
216 msg.add_header("Subject", item["title"])
217 msg.set_default_type("text/plain")
# Two renderings of the same content: the original HTML, and a plain-text
# version produced by HTML2Text.
# NOTE(review): the calls attaching these parts to msg are not visible here.
219 htmlpart = MIMEText(content.encode("utf-8"), "html", "utf-8")
220 textparser = HTML2Text()
221 textparser.feed(content.encode("utf-8"))
222 textcontent = textparser.gettext()
223 textpart = MIMEText(textcontent.encode("utf-8"), "plain", "utf-8")
227 # start by working out the filename we should be writing to, we do
228 # this following the normal maildir style rules
# File name: <pid>.<hostname>.<ten random alphanumerics>.<strftime('%s')>.
# NOTE(review): '%s' is likewise not a documented Python strftime directive;
# presumably it yields the epoch seconds on the target platform - confirm.
229 fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
# The message is written under tmp/ first.
# NOTE(review): the line that opens `fh` is not visible here.
230 fn = os.path.join(maildir, "tmp", fname)
232 fh.write(msg.as_string())
234 # now move it in to the new directory
# NOTE(review): only the destination path is computed on the visible line;
# the move itself is not visible.
235 newfn = os.path.join(maildir, "new", fname)
239 # now add to the database about the item
# Store the message id, creation date and content checksum for this item.
240 data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
241 db[url + "|" + item["link"]] = data
245 # first off, parse the command line arguments
# Script entry: parse options, locate the config file, resolve the state
# directory and the Maildir root, then deliver every configured feed.
# NOTE(review): this view of the file has no indentation and appears to be
# missing lines (try/except/else scaffolding and exit calls) - confirm
# against the complete file.
247 oparser = OptionParser()
# Two options are declared: -c/--conf and -s/--statedir.
249 "-c", "--conf", dest="conf",
250 help="location of config file"
253 "-s", "--statedir", dest="statedir",
254 help="location of directory to store state in"
257 (options, args) = oparser.parse_args()
259 # check for the configfile
# An explicitly given config file is probed with os.stat; the error message
# below is for the case where it does not exist.
263 if options.conf != None:
264 # does the file exist?
266 os.stat(options.conf)
267 configfile = options.conf
269 # should exit here as the specified file doesn't exist
270 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
273 # check through the default locations
# Default locations: $HOME/.rss2maildir.conf, then /etc/rss2maildir.conf.
275 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
276 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
279 os.stat("/etc/rss2maildir.conf")
280 configfile = "/etc/rss2maildir.conf"
282 sys.stderr.write("No config file found. Exiting.\n")
285 # Right - if we've got this far, we've got a config file, now for the hard
# NOTE(review): the call that reads `configfile` into the parser is not
# visible here.
288 scp = SafeConfigParser()
# Default Maildir root; replaced below if [general] sets maildir_root.
291 maildir_root = "RSSMaildir"
# State directory, first choice: the --statedir option. It must be a
# directory; the "Couldn't create" message below is for a failed creation
# attempt (the creation call itself is not visible).
294 if options.statedir != None:
295 state_dir = options.statedir
297 mode = os.stat(state_dir)[stat.ST_MODE]
298 if not stat.S_ISDIR(mode):
299 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
302 # try to make the directory
306 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
# State directory, second choice: the state_dir option of [general].
308 elif scp.has_option("general", "state_dir"):
309 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): the check below stats and reports `state_dir`, not the
# `new_state_dir` just read from the config, and `state_dir` is only assigned
# from `new_state_dir` after the mkdir further down - looks like the wrong
# variable is being checked; confirm against the complete file.
311 mode = os.stat(state_dir)[stat.ST_MODE]
312 if not stat.S_ISDIR(mode):
313 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
318 os.mkdir(new_state_dir)
319 state_dir = new_state_dir
321 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
# State directory, last choice: the existing value of state_dir, which must
# be a directory (its default assignment is not visible here).
325 mode = os.stat(state_dir)[stat.ST_MODE]
326 if not stat.S_ISDIR(mode):
327 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
333 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
# Maildir root: taken from [general] if set, then checked to be a directory,
# with os.mkdir as the creation path.
336 if scp.has_option("general", "maildir_root"):
337 maildir_root = scp.get("general", "maildir_root")
340 mode = os.stat(maildir_root)[stat.ST_MODE]
341 if not stat.S_ISDIR(mode):
342 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
346 os.mkdir(maildir_root)
348 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section other than "general" is treated as a feed.
351 feeds = scp.sections()
353 feeds.remove("general")
357 for section in feeds:
358 # check if the directory exists
# Maildir name for this feed, from the section's "maildir" option.
# NOTE(review): the surrounding condition and any fallback are not visible.
361 maildir = scp.get(section, "maildir")
# Quote the name with urlencode and keep the value half of "key=value", then
# place it under the Maildir root.
365 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
366 maildir = os.path.join(maildir_root, maildir)
# If the maildir exists, check its cur/, tmp/ and new/ subdirectories,
# creating any that are missing and reporting a broken maildir otherwise.
369 exists = os.stat(maildir)
370 if stat.S_ISDIR(exists[stat.ST_MODE]):
371 # check if there's a new, cur and tmp directory
373 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
375 os.mkdir(os.path.join(maildir, "cur"))
# NOTE(review): for cur/ and tmp/ the S_ISDIR test follows the mkdir line, so
# `mode` may be stale or unset when the stat failed; the new/ check below is
# ordered differently - confirm the intended nesting.
376 if not stat.S_ISDIR(mode):
377 sys.stderr.write("Broken maildir: %s\n" %(maildir))
379 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
381 os.mkdir(os.path.join(maildir, "tmp"))
382 if not stat.S_ISDIR(mode):
383 sys.stderr.write("Broken maildir: %s\n" %(maildir))
385 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
386 if not stat.S_ISDIR(mode):
387 sys.stderr.write("Broken maildir: %s\n" %(maildir))
389 os.mkdir(os.path.join(maildir, "new"))
391 sys.stderr.write("Broken maildir: %s\n" %(maildir))
# Otherwise the maildir and its new/, cur/ and tmp/ subdirectories are
# created (the mkdir of the maildir itself is not visible here).
396 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
399 os.mkdir(os.path.join(maildir, "new"))
400 os.mkdir(os.path.join(maildir, "cur"))
401 os.mkdir(os.path.join(maildir, "tmp"))
403 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
406 # right - we've got the directories, we've got the section, we know the
# The section name is passed as the `url` argument, i.e. each section name
# is the feed URL.
409 parse_and_deliver(maildir, section, state_dir)