4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with ``feed()`` and read the result with ``gettext()``.

    * ``h1`` / ``h2`` / ``h3``-``h6`` headings are underlined with ``=``,
      ``-`` and ``~`` respectively.
    * Paragraph text is wrapped at 70 columns.
    * Blockquote text is wrapped at 68 columns and prefixed with ``"> "``.
    * ``<br>`` (in either ``<br>`` or ``<br/>`` form) ends the current line.
    * Anchors and all other tags get no special treatment; their text is
      handled like any other character data.

    Character data is routed with the priority heading > blockquote >
    paragraph > bare output, and every chunk is stripped and followed by a
    single space before being buffered.
    """

    # tag -> (state flag that is True while inside it, underline character)
    _HEADINGS = {
        "h1": ("inheadingone", "="),
        "h2": ("inheadingtwo", "-"),
        "h3": ("inotherheading", "~"),
        "h4": ("inotherheading", "~"),
        "h5": ("inotherheading", "~"),
        "h6": ("inotherheading", "~"),
    }

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Text that appears before any block-level tag counts as a paragraph.
        self.inparagraph = True
        self.inblockquote = False
        self.text = ""              # finished output
        self.currentparagraph = ""  # pending paragraph text, not yet wrapped
        self.blockquote = ""        # pending blockquote text
        self.headingtext = ""       # pending heading text
        # Explicit base call: under Python 2 HTMLParser is an old-style
        # class, so super() is not available.
        HTMLParser.__init__(self)

    def _wrap_paragraph(self):
        """Return the pending paragraph wrapped at 70 columns."""
        return "\n".join(textwrap.wrap(self.currentparagraph, 70))

    def _quote_blockquote(self):
        """Return the pending blockquote as newline-led ``"> "`` lines."""
        lines = [a.strip() for a in textwrap.wrap(self.blockquote, 68)]
        return "\n> " + "\n> ".join(lines)

    def _flush_paragraph(self):
        """Move pending paragraph text into the output and clear the buffer.

        Called before a new block starts so that text buffered earlier is
        neither discarded nor emitted after the block that follows it.
        """
        if self.inparagraph:
            self.text = self.text + self._wrap_paragraph()
        self.currentparagraph = ""

    def _handle_br(self):
        """End the current line in whichever buffer is receiving data."""
        # Same priority as handle_data: inside a blockquote the text lives in
        # self.blockquote even while inparagraph is still True.
        if self.inblockquote:
            self.text = self.text + self._quote_blockquote()
            self.blockquote = ""
        elif self.inparagraph:
            # The trailing newline is what makes <br> an actual line break.
            self.text = self.text + self._wrap_paragraph() + "\n"
            self.currentparagraph = ""
        else:
            self.text = self.text + "\n"

    def handle_starttag(self, tag, attrs):
        """Update parser state for an opening tag; ``attrs`` is unused."""
        tag = tag.lower()
        if tag in self._HEADINGS:
            self._flush_paragraph()
            setattr(self, self._HEADINGS[tag][0], True)
            self.inparagraph = False
        elif tag == "br":
            self._handle_br()
        elif tag == "blockquote":
            self._flush_paragraph()
            self.inblockquote = True
            self.text = self.text + "\n"
        elif tag == "p":
            self._flush_paragraph()
            # Separate this paragraph from whatever was emitted before it.
            if self.text != "":
                self.text = self.text + "\n\n"
            self.inparagraph = True
        # Every other tag, including <a>, is deliberately ignored; its
        # character data still arrives through handle_data.

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag; only ``<br/>`` has any effect."""
        if tag.lower() == "br":
            self._handle_br()

    def handle_endtag(self, tag):
        """Emit the block that the closing tag completes."""
        tag = tag.lower()
        if tag in self._HEADINGS:
            flag, underline = self._HEADINGS[tag]
            setattr(self, flag, False)
            # handle_data leaves a trailing space; strip it so the underline
            # is exactly as long as the heading.
            heading = self.headingtext.strip()
            self.headingtext = ""
            if heading:
                # Keep the heading from being glued to preceding text.
                if self.text and not self.text.endswith("\n"):
                    self.text = self.text + "\n\n"
                self.text = (self.text + heading + "\n"
                             + underline * len(heading))
        elif tag == "p":
            self._flush_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self.text = self.text + self._quote_blockquote()
            self.inblockquote = False
            self.blockquote = ""

    def handle_data(self, data):
        """Append character data to the buffer for the current context."""
        chunk = data.strip() + " "
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + chunk
        elif self.inblockquote:
            self.blockquote = self.blockquote + chunk
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + chunk
        else:
            self.text = self.text + chunk

    def handle_entityref(self, name):
        """Translate ``&name;`` via the module-level ``entities`` table.

        Unknown entities are passed through literally.  The result goes
        through handle_data so it lands in the same buffer, and in the same
        order, as the text surrounding it.
        """
        key = name.lower()
        if key in entities:
            self.handle_data(entities[key])
        else:
            self.handle_data("&" + name + ";")

    def gettext(self):
        """Return the text produced so far.

        Includes the pending paragraph when one is still open; the parser
        state itself is left untouched.
        """
        data = self.text
        if self.inparagraph:
            data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
        return data
# Fetch the feed at `url` with feedparser and, for each item, build a
# multipart/alternative email and write it under `maildir`, recording the
# item in a dbm database kept in `statedir`.
#
#   maildir  - directory whose tmp/ and new/ subdirectories receive the file
#   url      - feed location passed to feedparser.parse(); also used for the
#              To header, the unix-from line and the state-database key
#   statedir - directory holding the "seen" dbm database
#
# NOTE(review): the number at the start of each line skips values (166, 169,
# 171, 173, 178-180, 182-184, ...), so statements are missing from this
# listing and its indentation is gone. Restore both before relying on the
# control flow described in the notes below.
159 def parse_and_deliver(maildir, url, statedir):
# NOTE(review): md is not referenced again in the lines visible here.
160 md = mailbox.Maildir(maildir)
161 fp = feedparser.parse(url)
# Flag "c" opens the database read-write, creating it if it does not exist.
162 db = dbm.open(os.path.join(statedir, "seen"), "c")
163 for item in fp["items"]:
164 # have we seen it before?
165 # need to work out what the content is first...
# Use the first "content" entry when the item has one; the "summary"
# assignment is presumably the fallback (its `else:` is not in this listing).
167 if item.has_key("content"):
168 content = item["content"][0]["value"]
170 content = item["summary"]
# The MD5 of the UTF-8 encoded body is what gets compared against the
# stored record to decide whether the item changed.
172 md5sum = md5.md5(content.encode("utf8")).hexdigest()
# State records are keyed by "<url>|<item link>" and hold a query-string
# encoded value (written by the urlencode call at the end of the loop).
174 if db.has_key(url + "|" + item["link"]):
175 data = db[url + "|" + item["link"]]
176 data = cgi.parse_qs(data)
# NOTE(review): what happens on a matching checksum is not in this listing;
# presumably the item is skipped - confirm.
177 if data["contentmd5"][0] == md5sum:
# NOTE(review): presumably guarded by a try/except that supplies a fallback
# when the item has no "author" - confirm.
181 author = item["author"]
185 # create a basic email message
186 msg = MIMEMultipart("alternative")
# Message-ID has the form <YYYYmmddHHMM.XXXXXX@hostname>, where XXXXXX is six
# random letters/digits.
187 messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
188 msg.add_header("Message-ID", messageid)
189 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
190 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
191 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Date comes from the first six fields of updated_parsed and is always
# labelled with a -0000 offset.
# NOTE(review): %e and %T here, and %s in the file name below, are not among
# the strftime directives Python documents as portable; they depend on the
# platform C library.
192 createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
193 msg.add_header("Date", createddate)
194 msg.add_header("Subject", item["title"])
195 msg.set_default_type("text/plain")
# Two UTF-8 renderings of the body: the original HTML, and plain text
# produced by running the same HTML through HTML2Text.
# NOTE(review): the calls that attach textpart/htmlpart to msg are not in
# this listing.
197 htmlpart = MIMEText(content.encode("utf8"), "html", "utf8")
198 textparser = HTML2Text()
199 textparser.feed(content.encode("utf8"))
200 textcontent = textparser.gettext()
201 textpart = MIMEText(textcontent, "plain", "utf8")
205 # start by working out the filename we should be writing to, we do
206 # this following the normal maildir style rules
# File name: <pid>.<hostname>.<10 random letters/digits>.<strftime('%s')>
207 fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
208 fn = os.path.join(maildir, "tmp", fname)
# NOTE(review): fh is opened (and presumably closed) in lines missing from
# this listing.
210 fh.write(msg.as_string())
212 # now move it in to the new directory
# NOTE(review): the statement that moves fn to newfn is not in this listing.
213 newfn = os.path.join(maildir, "new", fname)
217 # now add to the database about the item
# The stored record carries the message-id, the formatted date and the body
# checksum that the lookup at the top of the loop compares against.
218 data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
219 db[url + "|" + item["link"]] = data
# Top-level script: parse the command line, locate a config file, settle on
# a state directory and a maildir root, then call parse_and_deliver() once
# per config section other than "general".
#
# NOTE(review): the number at the start of each line skips values, so many
# statements (the try/except/else scaffolding, the exits after the error
# messages, the call that reads the config file, ...) are missing from this
# listing and its indentation is gone. The notes below describe only what
# the visible lines show.
223 # first off, parse the command line arguments
225 oparser = OptionParser()
# Two options: -c/--conf (config file) and -s/--statedir (state directory).
227 "-c", "--conf", dest="conf",
228 help="location of config file"
231 "-s", "--statedir", dest="statedir",
232 help="location of directory to store state in"
235 (options, args) = oparser.parse_args()
237 # check for the configfile
# An explicitly given config file is probed with os.stat; otherwise
# $HOME/.rss2maildir.conf and then /etc/rss2maildir.conf are probed.
241 if options.conf != None:
242 # does the file exist?
244 os.stat(options.conf)
245 configfile = options.conf
247 # should exit here as the specified file doesn't exist
248 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
251 # check through the default locations
253 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
254 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
257 os.stat("/etc/rss2maildir.conf")
258 configfile = "/etc/rss2maildir.conf"
260 sys.stderr.write("No config file found. Exiting.\n")
263 # Right - if we've got this far, we've got a config file, now for the hard
# NOTE(review): the statement that loads configfile into scp is not in this
# listing.
266 scp = SafeConfigParser()
# Default maildir root; replaced below when [general] sets maildir_root.
269 maildir_root = "RSSMaildir"
# State directory precedence: --statedir, then [general] state_dir, then a
# default whose assignment is not in this listing.
272 if options.statedir != None:
273 state_dir = options.statedir
275 mode = os.stat(state_dir)[stat.ST_MODE]
276 if not stat.S_ISDIR(mode):
277 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
280 # try to make the directory
# NOTE(review): unlike the other error messages, this one has no trailing
# newline.
284 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
286 elif scp.has_option("general", "state_dir"):
287 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this stats state_dir, not the new_state_dir just read from
# the config; it looks like the configured directory is what should be
# checked - confirm.
289 mode = os.stat(state_dir)[stat.ST_MODE]
290 if not stat.S_ISDIR(mode):
291 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
296 os.mkdir(new_state_dir)
297 state_dir = new_state_dir
299 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
303 mode = os.stat(state_dir)[stat.ST_MODE]
304 if not stat.S_ISDIR(mode):
305 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
311 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
314 if scp.has_option("general", "maildir_root"):
315 maildir_root = scp.get("general", "maildir_root")
# The maildir root must be a directory; the mkdir below is presumably the
# fallback for when the stat fails.
318 mode = os.stat(maildir_root)[stat.ST_MODE]
319 if not stat.S_ISDIR(mode):
320 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
324 os.mkdir(maildir_root)
326 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every section except "general" is treated as a feed.
329 feeds = scp.sections()
331 feeds.remove("general")
335 for section in feeds:
336 # check if the directory exists
339 maildir = scp.get(section, "maildir")
# urlencode the (section, maildir) pair and keep the part after the first
# "=", i.e. the percent-encoded maildir value, before joining it onto the
# maildir root.
343 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
344 maildir = os.path.join(maildir_root, maildir)
347 exists = os.stat(maildir)
348 if stat.S_ISDIR(exists[stat.ST_MODE]):
349 # check if there's a new, cur and tmp directory
# NOTE(review): in each of the three checks below, mode is assigned only by
# the os.stat call; once the structure is restored, verify that
# S_ISDIR(mode) is never evaluated on a path where that stat failed.
351 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
353 os.mkdir(os.path.join(maildir, "cur"))
354 if not stat.S_ISDIR(mode):
355 sys.stderr.write("Broken maildir: %s\n" %(maildir))
357 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
359 os.mkdir(os.path.join(maildir, "tmp"))
360 if not stat.S_ISDIR(mode):
361 sys.stderr.write("Broken maildir: %s\n" %(maildir))
363 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
364 if not stat.S_ISDIR(mode):
365 sys.stderr.write("Broken maildir: %s\n" %(maildir))
367 os.mkdir(os.path.join(maildir, "new"))
369 sys.stderr.write("Broken maildir: %s\n" %(maildir))
374 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# A newly created maildir gets its new/, cur/ and tmp/ subdirectories.
377 os.mkdir(os.path.join(maildir, "new"))
378 os.mkdir(os.path.join(maildir, "cur"))
379 os.mkdir(os.path.join(maildir, "tmp"))
381 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
384 # right - we've got the directories, we've got the section, we know the
# The section name is passed as parse_and_deliver's `url` argument, so each
# feed section is named after its feed URL.
387 parse_and_deliver(maildir, section, state_dir)