4 # rss2maildir.py - RSS feeds to Maildir 1 email per item
5 # Copyright (C) 2007 Brett Parker <iDunno@sommitrealweird.co.uk>
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28 from email.MIMEMultipart import MIMEMultipart
29 from email.MIMEText import MIMEText
38 from optparse import OptionParser
39 from ConfigParser import SafeConfigParser
41 from base64 import b64encode
47 from HTMLParser import HTMLParser
class HTML2Text(HTMLParser):
    """Convert a fragment of HTML into plain text.

    Headings are rendered as a title line followed by an underline
    ("=" for h1, "-" for h2, "~" for h3-h6), paragraphs are wrapped at
    70 columns with textwrap, and <br> produces a line break.  All other
    tags are ignored; their text content is kept.

    Usage: create an instance, call feed() with the HTML, then call
    gettext() to obtain the rendered text.

    NOTE(review): the reviewed excerpt of this class had gaps (the
    __init__ and gettext "def" lines, the initialisation of self.text
    and self.headingtext, and the body of the "a" branch were not
    visible).  Those parts are reconstructed from how the visible code
    uses them and should be confirmed against the full file.
    """

    def __init__(self):
        self.inheadingone = False
        self.inheadingtwo = False
        self.inotherheading = False
        # Content may start with bare text and no <p>, so parsing begins
        # inside an implicit paragraph.
        self.inparagraph = True
        self.text = ""
        self.currentparagraph = ""
        self.headingtext = ""
        # Explicit base-class call: HTMLParser is an old-style class on
        # Python 2, so super() cannot be used here.
        HTMLParser.__init__(self)

    def _blockbreak(self):
        """Make sure already-emitted text is followed by one blank line."""
        if self.text == "" or self.text.endswith("\n\n"):
            return
        if self.text.endswith("\n"):
            self.text = self.text + "\n"
        else:
            self.text = self.text + "\n\n"

    def _flushparagraph(self):
        """Move any buffered paragraph text, wrapped, into the output."""
        if self.currentparagraph.strip() != "":
            wrapped = textwrap.wrap(self.currentparagraph, 70)
            self.text = self.text + "\n".join(wrapped)
        self.currentparagraph = ""

    def _append(self, chunk):
        """Add chunk to whichever buffer is currently being filled."""
        if self.inparagraph:
            self.currentparagraph = self.currentparagraph + chunk
        elif self.inheadingone or self.inheadingtwo or self.inotherheading:
            self.headingtext = self.headingtext + chunk
        else:
            self.text = self.text + chunk

    def _startheading(self):
        """Close any open paragraph before a heading begins."""
        # Previously the buffered paragraph text was silently dropped.
        self._flushparagraph()
        self.inparagraph = False

    def _endheading(self, underline):
        """Emit the collected heading text underlined with underline."""
        # Strip first so the underline matches the visible title length
        # (the buffer always carries a trailing space from handle_data).
        title = self.headingtext.strip()
        self.headingtext = ""
        if title == "":
            return
        self._blockbreak()
        self.text = self.text + title + "\n" + underline * len(title) + "\n"

    def _linebreak(self):
        """Emit a line break, keeping it after any paragraph text so far."""
        if self.inparagraph:
            self._flushparagraph()
        self.text = self.text + "\n"

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == "h1":
            self._startheading()
            self.inheadingone = True
        elif tag == "h2":
            self._startheading()
            self.inheadingtwo = True
        elif tag in ("h3", "h4", "h5", "h6"):
            self._startheading()
            self.inotherheading = True
        elif tag == "a":
            # NOTE(review): the body of this branch was not visible in
            # the reviewed excerpt; links get no special rendering here.
            pass
        elif tag == "br":
            self._linebreak()
        elif tag == "p":
            # Keep text that preceded this <p> instead of discarding it.
            self._flushparagraph()
            self._blockbreak()
            self.inparagraph = True

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == "br":
            self._linebreak()

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "h1":
            self.inheadingone = False
            self._endheading("=")
        elif tag == "h2":
            self.inheadingtwo = False
            self._endheading("-")
        elif tag in ("h3", "h4", "h5", "h6"):
            self.inotherheading = False
            self._endheading("~")
        elif tag == "p":
            self._flushparagraph()
            self.inparagraph = False

    def handle_data(self, data):
        chunk = data.strip()
        if chunk == "":
            # Whitespace between tags carries no content.
            return
        self._append(chunk + " ")

    def handle_entityref(self, name):
        # "entities" is the module-level entity table defined elsewhere
        # in this file.  The replacement goes to the buffer currently
        # being filled so it stays in order with the surrounding text.
        key = name.lower()
        if key in entities:
            self._append(entities[key])
        else:
            # Unknown entity: pass it through unchanged.
            self._append("&" + name + ";")

    def gettext(self):
        """Return the text rendered so far, including an open paragraph."""
        data = self.text
        if self.inparagraph:
            data = data + "\n".join(textwrap.wrap(self.currentparagraph, 70))
        return data
def parse_and_deliver(maildir, url, statedir):
    """Fetch the feed at url and deliver new or changed items to maildir.

    Each feed item becomes one multipart/alternative message (a plain
    text rendering produced by HTML2Text plus the original HTML) that is
    written to maildir/tmp and then linked into maildir/new.

    A dbm database named "seen" inside statedir maps each item link to a
    urlencoded record holding the message-id, the created date and the
    md5 of the content.  An item is skipped when its link is already
    recorded with the same content md5.

    maildir  -- path of an existing Maildir (with tmp/ and new/)
    url      -- feed URL; also used as the To address name
    statedir -- directory holding the "seen" database

    NOTE(review): the reviewed excerpt of this function had gaps.  The
    "continue" for already-seen items, the author fallback, the
    msg.attach() calls, the open/close of the temporary file, the
    link/unlink pair and the final db.close() are reconstructed from
    context and should be confirmed against the full file.
    """
    # Function-scope imports for names the file-level import block is
    # not known to provide.
    import calendar
    import hashlib
    import time
    from email.utils import formatdate

    alphabet = string.ascii_letters + string.digits

    # The original bound this to an unused local; the call is kept in
    # case opening the Maildir is relied upon (TODO confirm).
    mailbox.Maildir(maildir)
    fp = feedparser.parse(url)
    db = dbm.open(os.path.join(statedir, "seen"), "c")
    try:
        for item in fp["items"]:
            # have we seen it before?
            # need to work out what the content is first...
            if "content" in item:
                content = item["content"][0]["value"]
            else:
                content = item["summary"]
            encoded = content.encode("utf8")

            md5sum = hashlib.md5(encoded).hexdigest()

            if db.has_key(item["link"]):
                seen = cgi.parse_qs(db[item["link"]])
                if seen.get("contentmd5", [None])[0] == md5sum:
                    continue

            try:
                author = item["author"]
            except KeyError:
                # presumably the feed URL stands in for a missing author
                author = url

            # create a basic email message
            msg = MIMEMultipart("alternative")
            messageid = "<" \
                + datetime.datetime.now().strftime("%Y%m%d%H%M") \
                + "." \
                + "".join([random.choice(alphabet) for a in range(0, 6)]) \
                + "@" + socket.gethostname() + ">"
            msg.add_header("Message-ID", messageid)
            msg.set_unixfrom("\"%s\" <rss2maildir@localhost>" % (url))
            msg.add_header("From", "\"%s\" <rss2maildir@localhost>" % (author))
            msg.add_header("To", "\"%s\" <rss2maildir@localhost>" % (url))

            # formatdate() is locale-independent and portable, unlike
            # strftime with %e/%T.  Items without a parsed date fall
            # back to the delivery time instead of aborting the run.
            updated = item.get("updated_parsed")
            if updated is None:
                timestamp = time.time()
            else:
                timestamp = calendar.timegm(updated)
            createddate = formatdate(timestamp)
            msg.add_header("Date", createddate)
            msg.add_header("Subject", item["title"])
            msg.set_default_type("text/plain")

            htmlpart = MIMEText(encoded, "html", "utf8")
            textparser = HTML2Text()
            textparser.feed(encoded)
            textcontent = textparser.gettext()
            textpart = MIMEText(textcontent, "plain", "utf8")
            # Plain text first, HTML last, so HTML is the preferred
            # alternative (order assumed, see NOTE above).
            msg.attach(textpart)
            msg.attach(htmlpart)

            # start by working out the filename we should be writing to,
            # we do this following the normal maildir style rules
            fname = str(os.getpid()) \
                + "." + socket.gethostname() \
                + "." \
                + "".join([random.choice(alphabet) for a in range(0, 10)]) \
                + "." + str(int(time.time()))
            fn = os.path.join(maildir, "tmp", fname)
            fh = open(fn, "w")
            try:
                fh.write(msg.as_string())
            finally:
                fh.close()

            # now move it in to the new directory
            newfn = os.path.join(maildir, "new", fname)
            os.link(fn, newfn)
            os.unlink(fn)

            # now add to the database about the item
            data = urllib.urlencode((
                ("message-id", messageid),
                ("created", createddate),
                ("contentmd5", md5sum),
                ))
            db[item["link"]] = data
    finally:
        # Always release the state database, even if delivery failed.
        db.close()
# Script body: parse the command line, locate and read the config file,
# make sure the state directory and maildir root exist, then deliver
# every configured feed.
#
# NOTE(review): the reviewed excerpt of this section had gaps (the
# try/except/else lines, the sys.exit() calls and their exit codes, the
# add_option() calls, scp.read() and the default state_dir were not
# visible).  Those parts are reconstructed from context; confirm the
# exit codes and defaults against the full file.


def _ensure_directory(path, notdir_message, create_message):
    """Make sure path is a directory, creating it when it is missing.

    Writes notdir_message to stderr and exits with status 1 when path
    exists but is not a directory; writes create_message and exits with
    status 1 when it is missing and cannot be created.
    """
    try:
        mode = os.stat(path)[stat.ST_MODE]
    except OSError:
        try:
            os.mkdir(path)
        except OSError:
            sys.stderr.write(create_message)
            sys.exit(1)
        return
    # Checked outside the try block so that sys.exit() is never caught
    # by an exception handler.
    if not stat.S_ISDIR(mode):
        sys.stderr.write(notdir_message)
        sys.exit(1)


# first off, parse the command line arguments

oparser = OptionParser()
oparser.add_option(
    "-c", "--conf", dest="conf",
    help="location of config file"
    )
oparser.add_option(
    "-s", "--statedir", dest="statedir",
    help="location of directory to store state in"
    )

(options, args) = oparser.parse_args()

# check for the configfile

configfile = None

if options.conf is not None:
    # does the file exist?
    if not os.path.exists(options.conf):
        # should exit here as the specified file doesn't exist
        sys.stderr.write("Config file %s does not exist. Exiting.\n" % (options.conf,))
        sys.exit(2)
    configfile = options.conf
else:
    # check through the default locations
    candidates = []
    if "HOME" in os.environ:
        candidates.append("%s/.rss2maildir.conf" % (os.environ["HOME"],))
    candidates.append("/etc/rss2maildir.conf")
    for candidate in candidates:
        if os.path.exists(candidate):
            configfile = candidate
            break
    if configfile is None:
        sys.stderr.write("No config file found. Exiting.\n")
        sys.exit(2)

# Right - if we've got this far, we've got a config file, now for the hard
# bits...

scp = SafeConfigParser()
scp.read(configfile)

maildir_root = "RSSMaildir"
state_dir = "state"

if options.statedir is not None:
    state_dir = options.statedir
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n" % (state_dir,),
        "Couldn't create statedir %s\n" % (state_dir,))
elif scp.has_option("general", "state_dir"):
    # The configured directory is the one that is checked and used; the
    # original checked the default and only adopted the configured value
    # when it had to create it.
    state_dir = scp.get("general", "state_dir")
    _ensure_directory(
        state_dir,
        "State directory (%s) is not a directory\n" % (state_dir,),
        "Couldn't create state directory %s\n" % (state_dir,))
else:
    _ensure_directory(
        state_dir,
        "State directory %s is not a directory\n" % (state_dir,),
        "State directory %s could not be created\n" % (state_dir,))

if scp.has_option("general", "maildir_root"):
    maildir_root = scp.get("general", "maildir_root")

_ensure_directory(
    maildir_root,
    "Maildir Root %s is not a directory\n" % (maildir_root,),
    "Couldn't create Maildir Root %s\n" % (maildir_root,))

# Every section except "general" describes one feed; the section name is
# passed to parse_and_deliver as the feed URL.
feeds = scp.sections()
if "general" in feeds:
    feeds.remove("general")

for section in feeds:
    # work out the maildir name, defaulting to the section name
    if scp.has_option(section, "maildir"):
        maildir = scp.get(section, "maildir")
    else:
        maildir = section

    # URL-quote the name so it is safe as a single path component
    maildir = urllib.urlencode(((section, maildir),)).split("=")[1]
    maildir = os.path.join(maildir_root, maildir)

    # check if the directory exists, along with new, cur and tmp
    broken = False
    if not os.path.exists(maildir):
        try:
            os.mkdir(maildir)
        except OSError:
            sys.stderr.write("Couldn't create root maildir %s\n" % (maildir))
            sys.exit(1)
    elif not os.path.isdir(maildir):
        broken = True

    if not broken:
        for subdir in ("new", "cur", "tmp"):
            path = os.path.join(maildir, subdir)
            if not os.path.exists(path):
                try:
                    os.mkdir(path)
                except OSError:
                    sys.stderr.write("Couldn't create required maildir directories for %s\n" % (section,))
                    sys.exit(1)
            elif not os.path.isdir(path):
                broken = True

    if broken:
        # Delivery into a broken maildir cannot succeed, so report it
        # and move on to the next feed.
        sys.stderr.write("Broken maildir: %s\n" % (maildir))
        continue

    # right - we've got the directories, we've got the section, we know the
    # url... lets play!

    parse_and_deliver(maildir, section, state_dir)