cd ~/life && git log | head
I'm the owner of a Kindle. This little (non-free :/) piece of hardware has quite changed my life. I now want to read everything on it, especially long web articles. So I've written a small script that extracts the core content of a web page and then transforms the result into a Mobipocket file.
This script combines two tools: decruft (a Python implementation of the Readability core algorithm) and ebook-convert, a tool supplied by calibre (it's in the package "calibre-bin"). I glued everything together with some dirty bash, and voilà.
The resulting code is quite horrible (and buggy in some places) but is good enough for my needs. Despite this dirty aspect I'm releasing it anyway; maybe it will be useful to someone or, even better, maybe someone will code a better version.
#!/bin/bash
#######################################
# Turn a web page into a Mobipocket file.
#
# Builds .tmp.html in the current directory from the page title (fetched with
# python + mechanize) and the output of decruft.py, downloads every image the
# page references into the current directory with wget, rewrites the image
# links to point at those local copies, and finally runs ebook-convert.
#
# Arguments:
#   $1 - URL of the page to convert
#   $2 - output file name, without extension
# Outputs:
#   .tmp.html and the downloaded images in the current directory,
#   mobi/$2.mobi (the mobi/ directory is created when missing).
#   Warnings go to stderr.
# Returns:
#   2 when an argument is missing, otherwise the status of ebook-convert.
#######################################
mobify()
{
  local url=${1-}
  local name=${2-}
  local page=.tmp.html
  local title scheme rest host origin url_path base src target

  if [[ -z $url || -z $name ]]; then
    echo "mobify: expected <url> <filename (without extension)>" >&2
    return 2
  fi

  # decruft does not emit a complete document, so the <html>/<head>/<body>
  # wrapper is written here. The URL is handed to python as an argument
  # instead of being pasted into the source, so quotes in it cannot break
  # (or inject code into) the snippet.
  title=$(python -c 'import sys
from mechanize import Browser
b = Browser()
b.open(sys.argv[1])
print(b.title())' "$url")

  {
    printf '<html><head><title>%s</title></head><body>\n' "$title"
    decruft.py -u "$url"
    printf '</body></html>\n'
  } > "$page"

  # Drop the attributes decruft leaves on <p> tags.
  sed -i 's/<p [^>][^>]*>/<p>/g' "$page"

  # Split the page URL into the pieces needed to resolve image links:
  #   origin = scheme://host      (for links starting with "/")
  #   base   = origin + directory (for plain relative links)
  if [[ $url == *://* ]]; then
    scheme=${url%%://*}
    rest=${url#*://}
  else
    # No scheme given: fall back to http.
    scheme=http
    rest=$url
  fi
  host=${rest%%[/?#]*}
  origin=${scheme}://${host}
  url_path=${rest#"$host"}
  url_path=${url_path%%[?#]*}
  base=${origin}${url_path%/*}/

  # Download every distinct image so ebook-convert can embed it.
  # grep -o puts each <img> tag on its own line, so several images sharing
  # one line of HTML are all picked up.
  while IFS= read -r src; do
    [[ -n $src ]] || continue
    case $src in
      data:*) continue ;;                       # inline image, nothing to fetch
      http://* | https://*) target=$src ;;      # already absolute
      //*) target=${scheme}:${src} ;;           # protocol-relative
      /*) target=${origin}${src} ;;             # relative to the site root
      *) target=${base}${src} ;;                # relative to the page
    esac
    # A missing image should not prevent the conversion.
    wget "$target" || echo "mobify: could not download $target" >&2
  done < <(grep -o '<img[^>]*>' "$page" \
    | sed -n 's/.*[[:space:]]src="\([^"]*\)".*/\1/p' \
    | sort -u)

  # Point every src="" at the current folder, where wget saved the files.
  sed -i 's#src="[^"]*/#src="#g' "$page"

  mkdir -p mobi
  ebook-convert "$page" "mobi/$name.mobi"
}
#######################################
# Command-line entry point.
# Usage: <url> <filename (without extension)> [force]
#
# Converts <url> into mobi/<filename>.mobi through mobify. When that file
# already exists the conversion only happens if the third argument is the
# literal word "force".
#
# Arguments:
#   $1 - URL of the page to convert
#   $2 - output file name, without extension
#   $3 - optional; "force" allows overwriting an existing output file
# Outputs:
#   Usage and refusal messages on stderr.
# Returns:
#   2 on a usage error, 1 when the output exists and "force" was not given,
#   otherwise the status of mobify.
#######################################
main()
{
  local url=${1-}
  local name=${2-}
  local force=${3-}

  if [[ -z $url || -z $name ]]; then
    echo "Usage: <url> <filename (without extension)> [force]" >&2
    return 2
  fi

  # The output file is named after the second argument, so that is the
  # name to look for when guarding against an accidental overwrite.
  if [[ -e mobi/$name.mobi && $force != "force" ]]; then
    echo 'add "force" as last args' >&2
    return 1
  fi

  mobify "$url" "$name"
}

main "$@"