4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
37 from optparse import OptionParser
38 from ConfigParser import SafeConfigParser
40 from base64 import b64encode
46 from HTMLParser import HTMLParser
# HTML2Text: an HTMLParser subclass whose handler methods build up a
# plain-text rendering of the HTML fed to it in self.text (heading text is
# collected separately in self.headingtext and underlined when the heading
# closes).
# NOTE(review): the constructor's `def` line and the initialisation of
# self.text / self.headingtext are not visible in this excerpt -- confirm
# against the full file.
58 class HTML2Text(HTMLParser):
# Flags recording which kind of heading element is currently open:
# <h1>, <h2>, or any of <h3>..<h6> (which share a single flag).
61 self.inheadingone = False
62 self.inheadingtwo = False
63 self.inotherheading = False
# Initialise the base parser explicitly via the parent class.
67 HTMLParser.__init__(self)
# Start-tag callback.  Sets the matching heading flag when a heading element
# opens and appends newline(s) to self.text for <br> and <p>.  Tag names are
# compared case-insensitively via tag.lower(); `attrs` is not used in any of
# the lines visible here.
69 def handle_starttag(self, tag, attrs):
70 if tag.lower() == "h1":
71 self.inheadingone = True
72 elif tag.lower() == "h2":
73 self.inheadingtwo = True
# <h3> through <h6> are not distinguished from one another.
74 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
75 self.inotherheading = True
# NOTE(review): the body of the <a> branch is not visible in this excerpt.
76 elif tag.lower() == "a":
# <br> becomes a single line break.
78 elif tag.lower() == "br":
79 self.text = self.text + "\n"
# <p> contributes a blank line.
# NOTE(review): one line between the test and the append below is not
# visible in this excerpt, so the append may be conditional -- confirm.
80 elif tag.lower() == "p":
82 self.text = self.text + "\n\n"
def handle_startendtag(self, tag, attrs):
    """Handle a self-closing tag such as ``<br/>``.

    Only ``br`` (matched case-insensitively) has an effect: one newline is
    appended to the accumulated text.  ``attrs`` is ignored.
    """
    if tag.lower() != "br":
        return
    self.text += "\n"
# End-tag callback.  When a heading closes, its flag is cleared and the
# collected heading text is appended to self.text followed by a newline and an
# underline as long as the heading text: "=" for <h1>, "-" for <h2>, "~" for
# <h3>..<h6>.  No newline is appended after the underline itself.
# The underline length is len(self.headingtext), so any trailing whitespace in
# the collected heading text is counted too.
88 def handle_endtag(self, tag):
89 if tag.lower() == "h1":
90 self.inheadingone = False
91 self.text = self.text + self.headingtext + "\n" + "=" * len(self.headingtext)
93 elif tag.lower() == "h2":
94 self.inheadingtwo = False
95 self.text = self.text + self.headingtext + "\n" + "-" * len(self.headingtext)
97 elif tag.lower() in ["h3", "h4", "h5", "h6"]:
98 self.inotherheading = False
99 self.text = self.text + self.headingtext + "\n" + "~" * len(self.headingtext)
# Reset the heading buffer for the next heading.
# NOTE(review): indentation is lost in this excerpt, so it is unclear whether
# this reset runs for every end tag or only in the <h3>..<h6> branch.  If it
# is only in that branch, <h1>/<h2> text would carry over into the next
# heading -- confirm against the full file.
100 self.headingtext = ""
def handle_data(self, data):
    """Collect a run of character data from the document.

    The whitespace-trimmed chunk plus one trailing space is appended to the
    heading buffer while any heading flag is set, and to the body text
    otherwise.
    """
    chunk = data.strip() + " "
    in_heading = self.inheadingone or self.inheadingtwo or self.inotherheading
    if in_heading:
        self.headingtext = self.headingtext + chunk
    else:
        self.text = self.text + chunk
def handle_entityref(self, name):
    """Append the replacement text for a named entity reference.

    ``name`` is the entity name without the surrounding ``&`` and ``;``.
    It is looked up case-insensitively in the ``entities`` mapping, which is
    defined elsewhere in this file.  Names not found there are written back
    to the text unchanged, in their original case, as ``&name;``.
    """
    key = name.lower()
    # Use a membership test rather than dict.has_key(): has_key() was
    # removed in Python 3, while ``in`` behaves identically on Python 2.
    if key in entities:
        self.text = self.text + entities[key]
    else:
        self.text = self.text + "&" + name + ";"
# Fetch the feed at `url` and, for each item, build an e-mail message and write
# it into the Maildir at `maildir`, remembering delivered items in a dbm
# database so unchanged items can be recognised on later runs.
#
#   maildir  -- path of the target Maildir; its "tmp" and "new"
#               subdirectories are used below.
#   url      -- passed to feedparser.parse(); also used in the unix From line
#               and the To header.
#   statedir -- directory holding the "seen" dbm database.
#
# NOTE(review): several short lines of this function (else/try/except
# branches, the file open/close and the move from tmp to new) are not visible
# in this excerpt; comments below only describe what is shown.
117 def parse_and_deliver(maildir, url, statedir):
# NOTE(review): `md` is not referenced again in the visible lines.
118 md = mailbox.Maildir(maildir)
119 fp = feedparser.parse(url)
# Flag "c": open read/write, creating the database if it does not exist.
120 db = dbm.open(os.path.join(statedir, "seen"), "c")
121 for item in fp["items"]:
122 # have we seen it before?
123 # need to work out what the content is first...
# Use the first "content" entry when the item has one; "summary" is the
# alternative source (the `else:` line itself is not visible here).
125 if item.has_key("content"):
126 content = item["content"][0]["value"]
128 content = item["summary"]
# Fingerprint of the item body, used to detect items whose content changed.
130 md5sum = md5.md5(content.encode("utf8")).hexdigest()
# State is keyed by the item's link and stored as a URL-encoded query string.
132 if db.has_key(item["link"]):
133 data = db[item["link"]]
134 data = cgi.parse_qs(data)
# NOTE(review): the action taken when the stored digest matches is not
# visible in this excerpt (presumably the item is skipped) -- confirm.
135 if data["contentmd5"][0] == md5sum:
# NOTE(review): handling of items without an "author" key is not visible here.
139 author = item["author"]
143 # create a basic email message
144 msg = MIMEMultipart("alternative")
# Message-ID: timestamp to the minute, six random alphanumerics, local hostname.
145 messageid = "<" + datetime.datetime.now().strftime("%Y%m%d%H%M") + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,6)]) + "@" + socket.gethostname() + ">"
146 msg.add_header("Message-ID", messageid)
147 msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" %(url))
148 msg.add_header("From", "\"%s\" <rss2maildir@localhost>" %(author))
149 msg.add_header("To", "\"%s\" <rss2maildir@localhost>" %(url))
# Date header is built from the first six fields of item["updated_parsed"]
# and always labelled "-0000".
# NOTE(review): %e and %T are not among the strftime directives Python
# documents as portable; they rely on the platform C library.
150 createddate = datetime.datetime(*item["updated_parsed"][0:6]).strftime("%a, %e %b %Y %T -0000")
151 msg.add_header("Date", createddate)
152 msg.add_header("Subject", item["title"])
153 msg.set_default_type("text/plain")
# Two renderings of the same content: the original HTML, and a plain-text
# version produced by running the HTML through HTML2Text.
# NOTE(review): the calls attaching these parts to `msg` are not visible here.
155 htmlpart = MIMEText(content.encode("utf8"), "html", "utf8")
156 textparser = HTML2Text()
157 textparser.feed(content.encode("utf8"))
158 textcontent = textparser.gettext()
159 textpart = MIMEText(textcontent, "plain", "utf8")
163 # start by working out the filename we should be writing to, we do
164 # this following the normal maildir style rules
# File name: <pid>.<hostname>.<10 random alphanumerics>.<strftime('%s')>.
# NOTE(review): '%s' is likewise a platform-specific strftime directive.
165 fname = str(os.getpid()) + "." + socket.gethostname() + "." + "".join([random.choice(string.ascii_letters + string.digits) for a in range(0,10)]) + "." + datetime.datetime.now().strftime('%s')
# The message is written under tmp/ first.
# NOTE(review): the line that opens `fh` is not visible in this excerpt.
166 fn = os.path.join(maildir, "tmp", fname)
168 fh.write(msg.as_string())
170 # now move it in to the new directory
# NOTE(review): the move operation itself is not visible in this excerpt.
171 newfn = os.path.join(maildir, "new", fname)
175 # now add to the database about the item
# Record message-id, creation date and content digest under the item's link.
176 data = urllib.urlencode((("message-id", messageid), ("created", createddate), ("contentmd5", md5sum)))
177 db[item["link"]] = data
# Command-line handling: -c/--conf names the config file and -s/--statedir the
# directory used to store state; both end up as attributes of `options`.
# NOTE(review): the `add_option(` call lines that wrap the two argument lists
# below are not visible in this excerpt.
181 # first off, parse the command line arguments
183 oparser = OptionParser()
185 "-c", "--conf", dest="conf",
186 help="location of config file"
189 "-s", "--statedir", dest="statedir",
190 help="location of directory to store state in"
# Positional arguments (`args`) are not used in the visible lines.
193 (options, args) = oparser.parse_args()
# Locate the configuration file.  Candidates, in the order they appear below:
# the path given with --conf, then $HOME/.rss2maildir.conf, then
# /etc/rss2maildir.conf.  os.stat() is used only as an existence probe; its
# result is discarded.
# NOTE(review): the try/except lines and any exit calls around these probes
# are not visible in this excerpt.
195 # check for the configfile
199 if options.conf != None:
200 # does the file exist?
202 os.stat(options.conf)
203 configfile = options.conf
205 # should exit here as the specified file doesn't exist
206 sys.stderr.write("Config file %s does not exist. Exiting.\n" %(options.conf,))
209 # check through the default locations
# NOTE(review): os.environ["HOME"] raises KeyError when HOME is unset; whether
# that is caught depends on the except clause, which is not visible here.
211 os.stat("%s/.rss2maildir.conf" %(os.environ["HOME"],))
212 configfile = "%s/.rss2maildir.conf" %(os.environ["HOME"],)
215 os.stat("/etc/rss2maildir.conf")
216 configfile = "/etc/rss2maildir.conf"
218 sys.stderr.write("No config file found. Exiting.\n")
# Work out the state directory and the root directory for the maildirs.
# The state directory comes from --statedir when given, otherwise from the
# "state_dir" option of the [general] config section; the remaining branch
# handles a value of `state_dir` set on a line not visible in this excerpt.
# Each candidate is stat()ed and must be a directory.
# NOTE(review): the try/except lines, the mkdir for the --statedir case, the
# config-file read and any exit calls are not visible in this excerpt.
221 # Right - if we've got this far, we've got a config file, now for the hard
224 scp = SafeConfigParser()
# Default maildir root, relative to the current working directory.
227 maildir_root = "RSSMaildir"
230 if options.statedir != None:
231 state_dir = options.statedir
233 mode = os.stat(state_dir)[stat.ST_MODE]
234 if not stat.S_ISDIR(mode):
235 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
238 # try to make the directory
# NOTE(review): unlike the other messages, this one has no trailing "\n".
242 sys.stderr.write("Couldn't create statedir %s" %(state_dir))
244 elif scp.has_option("general", "state_dir"):
245 new_state_dir = scp.get("general", "state_dir")
# NOTE(review): this checks `state_dir`, not the `new_state_dir` just read
# from the config -- looks like the wrong variable; confirm.
247 mode = os.stat(state_dir)[stat.ST_MODE]
248 if not stat.S_ISDIR(mode):
249 sys.stderr.write("State directory (%s) is not a directory\n" %(state_dir))
# The configured path is adopted as state_dir once it has been created.
254 os.mkdir(new_state_dir)
255 state_dir = new_state_dir
257 sys.stderr.write("Couldn't create state directory %s\n" %(new_state_dir))
261 mode = os.stat(state_dir)[stat.ST_MODE]
262 if not stat.S_ISDIR(mode):
263 sys.stderr.write("State directory %s is not a directory\n" %(state_dir))
269 sys.stderr.write("State directory %s could not be created\n" %(state_dir))
# The [general] "maildir_root" option overrides the default root.
272 if scp.has_option("general", "maildir_root"):
273 maildir_root = scp.get("general", "maildir_root")
276 mode = os.stat(maildir_root)[stat.ST_MODE]
277 if not stat.S_ISDIR(mode):
278 sys.stderr.write("Maildir Root %s is not a directory\n" %(maildir_root))
282 os.mkdir(maildir_root)
284 sys.stderr.write("Couldn't create Maildir Root %s\n" %(maildir_root))
# Every config section other than "general" describes one feed.  For each
# feed, make sure its maildir (with cur/, tmp/ and new/) exists under
# maildir_root, then fetch and deliver the feed.
# NOTE(review): the try/except/else lines structuring the checks below are
# not visible in this excerpt; comments describe only the visible statements.
287 feeds = scp.sections()
# NOTE(review): list.remove raises ValueError if there is no "general"
# section; whether that is guarded is not visible here.
289 feeds.remove("general")
293 for section in feeds:
294 # check if the directory exists
297 maildir = scp.get(section, "maildir")
# urlencode yields "key=value" with both sides percent-encoded, so the text
# after the first "=" is the encoded maildir name, used as the directory name.
301 maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
302 maildir = os.path.join(maildir_root, maildir)
305 exists = os.stat(maildir)
306 if stat.S_ISDIR(exists[stat.ST_MODE]):
307 # check if there's a new, cur and tmp directory
309 mode = os.stat(os.path.join(maildir, "cur"))[stat.ST_MODE]
311 os.mkdir(os.path.join(maildir, "cur"))
312 if not stat.S_ISDIR(mode):
313 sys.stderr.write("Broken maildir: %s\n" %(maildir))
315 mode = os.stat(os.path.join(maildir, "tmp"))[stat.ST_MODE]
317 os.mkdir(os.path.join(maildir, "tmp"))
318 if not stat.S_ISDIR(mode):
319 sys.stderr.write("Broken maildir: %s\n" %(maildir))
321 mode = os.stat(os.path.join(maildir, "new"))[stat.ST_MODE]
322 if not stat.S_ISDIR(mode):
323 sys.stderr.write("Broken maildir: %s\n" %(maildir))
325 os.mkdir(os.path.join(maildir, "new"))
327 sys.stderr.write("Broken maildir: %s\n" %(maildir))
332 sys.stderr.write("Couldn't create root maildir %s\n" %(maildir))
# Create the three standard maildir subdirectories.
335 os.mkdir(os.path.join(maildir, "new"))
336 os.mkdir(os.path.join(maildir, "cur"))
337 os.mkdir(os.path.join(maildir, "tmp"))
339 sys.stderr.write("Couldn't create required maildir directories for %s\n" %(section,))
342 # right - we've got the directories, we've got the section, we know the
# The section name is passed as parse_and_deliver's `url` argument.
345 parse_and_deliver(maildir, section, state_dir)