4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    """Render an HTML fragment as wrapped plain text.

    Feed markup with ``feed()`` and collect the result with ``gettext()``.

    * ``h1`` / ``h2`` / ``h3``-``h6`` headings are underlined with ``=``,
      ``-`` and ``~`` respectively.
    * Paragraph text is wrapped at 70 columns.
    * Blockquote text is wrapped at 68 columns and prefixed with ``> ``.
    * Text inside ``pre`` is copied through untouched.
    * ``br`` flushes whichever buffer is currently collecting text.

    The parser starts out "in a paragraph", so text that precedes any tag
    is collected and wrapped like paragraph text.
    """

    def __init__(self):
        # State flags: which kind of element the incoming text belongs to.
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        self.inparagraph = True
        self.inblockquote = False
        self.inlink = False
        self.inpre = False
        # Output accumulated so far, plus one pending buffer per text kind.
        self.text = ""
        self.currentparagraph = ""
        self.headingtext = ""
        self.blockquote = ""
        HTMLParser.__init__(self)

    def _take_paragraph(self):
        """Return the pending paragraph wrapped to 70 columns and clear it."""
        wrapped = "\n".join(textwrap.wrap(self.currentparagraph, 70))
        self.currentparagraph = ""
        return wrapped

    def _take_blockquote(self):
        """Return the pending blockquote as ``> `` lines and clear it."""
        lines = [line.strip() for line in textwrap.wrap(self.blockquote, 68)]
        self.blockquote = ""
        return "\n> " + "\n> ".join(lines) + "\n"

    def _line_break(self):
        """Handle ``<br>`` / ``<br/>``: flush the active buffer and break."""
        # Same priority as handle_data: text inside a blockquote is being
        # collected in self.blockquote even when inparagraph is also set,
        # so that is the buffer the break has to split.
        if self.inblockquote:
            self.text = self.text + self._take_blockquote()
        elif self.inparagraph:
            self.text = self.text + self._take_paragraph() + "\n"
        else:
            self.text = self.text + "\n"

    def _emit_heading(self, underline):
        """Append the collected heading, underlined with *underline*."""
        # handle_data pads every chunk with a trailing space; strip it so
        # the heading line and its underline have the same width.
        heading = self.headingtext.strip()
        self.text = self.text + "\n\n" + heading + "\n" + underline * len(heading)
        self.headingtext = ""

    def handle_starttag(self, tag, attrs):
        """Update parser state for an opening tag."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = True
            self.inparagraph = False
        elif tag == "h2":
            self.inheadingtwo = True
            self.inparagraph = False
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = True
            self.inparagraph = False
        elif tag == "a":
            self.inlink = True
        elif tag == "br":
            self._line_break()
        elif tag == "blockquote":
            self.inblockquote = True
            self.text = self.text + "\n"
        elif tag == "p":
            # Flush an unterminated paragraph first, *then* add the blank
            # line, so the separator lands between the two paragraphs.
            if self.inparagraph:
                self.text = self.text + self._take_paragraph()
            if self.text != "":
                self.text = self.text + "\n\n"
            self.currentparagraph = ""
            self.inparagraph = True
        elif tag == "pre":
            self.text = self.text + "\n"
            self.inpre = True
            self.inparagraph = False
            self.inblockquote = False

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags; only ``<br/>`` has any effect."""
        if tag.lower() == "br":
            self._line_break()

    def handle_endtag(self, tag):
        """Flush the matching buffer and clear the state flag for *tag*."""
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._emit_heading("=")
        elif tag == "h2":
            self.inheadingtwo = False
            self._emit_heading("-")
        elif tag in ["h3", "h4", "h5", "h6"]:
            self.inotherheading = False
            self._emit_heading("~")
        elif tag == "p":
            # The buffer is cleared here so a stray second </p> cannot
            # emit the same paragraph again.
            self.text = self.text + self._take_paragraph()
            self.inparagraph = False
        elif tag == "blockquote":
            self.text = self.text + self._take_blockquote()
            self.inblockquote = False
        elif tag == "pre":
            self.inpre = False

    def handle_data(self, data):
        """Append character data to the buffer selected by the current state."""
        if self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + data.strip() + " "
        elif self.inblockquote:
            self.blockquote = self.blockquote + data.strip() + " "
        elif self.inparagraph:
            self.currentparagraph = self.currentparagraph + data.strip() + " "
        elif self.inpre:
            # Preformatted text keeps its whitespace exactly as written.
            self.text = self.text + data
        else:
            self.text = self.text + data.strip() + " "

    def handle_entityref(self, name):
        """Translate ``&name;`` using the module-level ``entities`` mapping.

        Unknown entities are passed through literally.  The replacement is
        routed through ``handle_data`` so it lands in the same buffer, and
        therefore the same position, as the text around it.
        """
        key = name.lower()
        if key in entities:
            self.handle_data(entities[key])
        else:
            self.handle_data("&" + name + ";")

    def gettext(self):
        """Return the converted text, including any unflushed paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
        return data
def _random_token(length):
    """Return *length* random ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join([random.choice(alphabet) for _ in range(length)])


def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at *url* and deliver each new or changed item as mail.

    Every item becomes one multipart/alternative message (plain text plus
    HTML) written to ``<maildir>/tmp`` and then moved into ``<maildir>/new``.
    An MD5 of the item content is stored per ``url|link`` key in the dbm
    file ``<statedir>/seen``; an item whose stored digest matches is skipped.

    maildir  -- path of an existing Maildir with tmp/ and new/ directories
    url      -- feed URL; also used in the To header and as the fallback
                author name
    statedir -- directory holding the ``seen`` database
    """
    # Function-scope imports so this block does not depend on names that
    # the module-level import list may not provide.
    import calendar
    import time
    from email.utils import formatdate

    # Kept from the original for its side effects; the object is not used.
    mailbox.Maildir(maildir)
    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...
            try:
                content = item["content"][0]["value"]
            except KeyError:
                content = item["summary"]

            md5sum = md5.md5(content.encode("utf8")).hexdigest()

            seenkey = url + "|" + item["link"]
            try:
                seen = cgi.parse_qs(db[seenkey])
            except KeyError:
                seen = {}
            if seen.get("contentmd5", [None])[0] == md5sum:
                continue

            try:
                author = item["author"]
            except KeyError:
                author = url

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + _random_token(6) + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
            # formatdate() always emits English day/month names and a
            # "-0000" zone, unlike strftime with %e/%T which is glibc-only
            # and locale dependent.  Items without a usable timestamp fall
            # back to the current time instead of aborting the whole run.
            # NOTE(review): assumes updated_parsed is a UTC time tuple, as
            # the original "-0000" suffix implied -- confirm.
            try:
                createddate = formatdate(calendar.timegm(item["updated_parsed"]))
            except (KeyError, TypeError):
                createddate = formatdate()
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(content.encode("utf8"), "html", "utf8")
            textparser = HTML2Text()
            textparser.feed(content.encode("utf8"))
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent, "plain", "utf8")
            # Plain text first, HTML last: in multipart/alternative the
            # last part is the preferred rendering.
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to, we
            # do this following the normal maildir style rules
            fname = str(os.getpid()) + "." + socket.gethostname() + "." + _random_token(10) + "." + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()
            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.rename(fn, newfn)

            # now add to the database about the item
            data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
            db[seenkey] = data
    finally:
        # Close the state database even when a delivery fails part-way.
        db.close()
def _find_config_file(explicit_path):
    """Return the config file to use, or exit if none can be found.

    An explicitly requested file must exist.  Otherwise
    ``$HOME/.rss2maildir.conf`` and then ``/etc/rss2maildir.conf`` are tried.
    """
    # NOTE(review): the exit status used for a missing config file was not
    # visible in the reviewed source; 2 is an assumption -- confirm.
    if explicit_path is not None:
        if os.path.exists(explicit_path):
            return explicit_path
        # should exit here as the specified file doesn't exist
        sys.stderr.write("Config file %s does not exist. Exiting.\n" %(explicit_path,))
        sys.exit(2)

    # check through the default locations
    candidates = []
    home = os.environ.get("HOME")
    if home is not None:
        candidates.append("%s/.rss2maildir.conf" %(home,))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    sys.stderr.write("No config file found. Exiting.\n")
    sys.exit(2)


def _ensure_directory(path, not_a_directory_msg, create_failed_msg):
    """Make sure *path* is a directory, creating it if missing.

    Both messages are ``%``-templates taking the path.  The process exits
    with status 1 if the path exists but is not a directory, or if it
    cannot be created.
    """
    # Only OSError is caught: a bare except here would also swallow the
    # SystemExit raised by sys.exit() in the "not a directory" case.
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_failed_msg %(path,))
            sys.exit(1)
        return
    if not stat.S_ISDIR(mode):
        sys.stderr.write(not_a_directory_msg %(path,))
        sys.exit(1)


def _ensure_maildir(maildir, section):
    """Make sure *maildir* exists with new/, cur/ and tmp/ inside it.

    Returns True when the maildir is usable.  Returns False, after writing
    "Broken maildir", when the maildir or one of its three subdirectories
    exists but is not a directory.  Exits with status 1 when a directory
    cannot be created.
    """
    try:
        mode = os.stat(maildir)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
            sys.exit(1)
    else:
        if not stat.S_ISDIR(mode):
            sys.stderr.write("Broken maildir: %s\n" %(maildir))
            return False

    # check if there's a new, cur and tmp directory
    for subdir in ("new", "cur", "tmp"):
        path = os.path.join(maildir, subdir)
        try:
            mode = os.stat(path)[stat.ST_MODE]
        except OSError:
            try:
                os.mkdir(path)
            except OSError:
                sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
                sys.exit(1)
        else:
            # The directory check belongs on the path that was actually
            # stat'ed, not after a mkdir in the failure path.
            if not stat.S_ISDIR(mode):
                sys.stderr.write("Broken maildir: %s\n" %(maildir))
                return False
    return True


def main():
    """Read the configuration and deliver every configured feed.

    Each config section other than ``general`` names one feed: the section
    name is passed to ``parse_and_deliver`` as the feed URL, and its
    optional ``maildir`` option names the target folder under the maildir
    root.  ``general`` may set ``state_dir`` and ``maildir_root``; the
    ``--statedir`` command-line option overrides the configured state
    directory.
    """
    # first off, parse the command line arguments
    oparser = OptionParser()
    oparser.add_option(
        "-c", "--conf", dest="conf",
        help="location of config file"
        )
    oparser.add_option(
        "-s", "--statedir", dest="statedir",
        help="location of directory to store state in"
        )

    (options, args) = oparser.parse_args()

    # check for the configfile
    configfile = _find_config_file(options.conf)

    # Right - if we've got this far, we've got a config file, now for the
    # hard bit: reading it.
    scp = SafeConfigParser()
    scp.read(configfile)

    maildir_root = "RSSMaildir"
    # NOTE(review): the built-in default state directory was not visible
    # in the reviewed source; "state" is an assumption -- confirm.
    state_dir = "state"

    if options.statedir is not None:
        state_dir = options.statedir
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create statedir %s\n")
    elif scp.has_option("general", "state_dir"):
        # Validate the *configured* directory; checking the built-in
        # default here would ignore or reject a valid setting.
        state_dir = scp.get("general", "state_dir")
        _ensure_directory(
            state_dir,
            "State directory (%s) is not a directory\n",
            "Couldn't create state directory %s\n")
    else:
        _ensure_directory(
            state_dir,
            "State directory %s is not a directory\n",
            "State directory %s could not be created\n")

    if scp.has_option("general", "maildir_root"):
        maildir_root = scp.get("general", "maildir_root")

    _ensure_directory(
        maildir_root,
        "Maildir Root %s is not a directory\n",
        "Couldn't create Maildir Root %s\n")

    feeds = scp.sections()
    try:
        feeds.remove("general")
    except ValueError:
        # No [general] section; every section is a feed.
        pass

    for section in feeds:
        # check if the directory exists
        if scp.has_option(section, "maildir"):
            maildir = scp.get(section, "maildir")
        else:
            maildir = section

        # URL-quote the folder name so it is safe as a single path
        # component; urlencode yields "key=value" with both sides quoted,
        # so the part after "=" is the quoted folder name.
        maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
        maildir = os.path.join(maildir_root, maildir)

        if not _ensure_maildir(maildir, section):
            # Delivering into a broken maildir can only fail; skip this
            # feed and carry on with the others.
            continue

        # right - we've got the directories, we've got the section, we
        # know the url... lets play!
        parse_and_deliver(maildir, section, state_dir)


if __name__ == "__main__":
    main()