Listing of clean_web_referrers
#!/bin/csh
#
# clean_web_referrers
#
# Walks ~/public_html, finds every directory named ".referrers", and
# rewrites each *.html file found beneath one, dropping the
# '<li><a href="...' lines whose URL matches any of the patterns below
# (the author's own pages, file:/news: links, and a list of search
# engines and robots).  All other lines are kept unchanged.
#
# Each page is copied to <page>.bak, filtered from the copy back into
# <page>, and the copy is then removed.
#
umask 22
cd ~/public_html
# Make the directory owner-only while the pages are being rewritten;
# the matching "chmod 755 ." is at the bottom of the script.
chmod 700 .
# wait for any accesses to finish up.
sleep 10
foreach d (`find . -type d -name .referrers -print`)
    foreach f (`find $d -type f -name \*.html -print`)
        # find recurses below $d, so take the directory and the file
        # name from the page's own path (:h = head, :t = tail) instead
        # of assuming the page sits directly in $d.
        set pagedir=$f:h
        set pagename=$f:t
        pushd $pagedir >/dev/null
        # Only filter when the backup copy succeeded: the output
        # redirection at the end of the pipeline truncates $pagename,
        # and the .bak copy is the only input it is rebuilt from.
        cp $pagename ${pagename}.bak
        if ($status == 0) then
            #
            # Don't bother trying to put all these into 1 RE.  egrep cannot
            # take a RE that big, even if the RE is in a file (using the -f
            # option).  Don't try to use multiple -e's either: egrep only
            # looks at the last one.  (Both limits are as reported by the
            # original author for the egrep this was written against.)
            #
            # Every stage is "egrep -i -v -e RE": case-insensitive, keep
            # only the lines that do NOT match.  Dots in host names are
            # escaped so they match a literal "." only.
            #
            egrep -ive '<li><a href="http://(www\.pads)(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' ${pagename}.bak \
            | egrep -ive '<li><a href="http://(dictator)(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' \
            | egrep -ive '<li><a href="http://se(\.math)?(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' \
            | egrep -ive '<li><a href="http://(coulomb|sunee|www\.ece)(\.uwaterloo\.ca)?/(~|%7e)mjfrazer/' \
            | egrep -ive '<li><a href="http://(www\.pads|dictator)(:[0-9]*)?/' \
            | egrep -ive '<li><a href="(file|news):' \
            | egrep -ive '<li><a href="http://www\.alta-vista\.com/' \
            | egrep -ive '<li><a href="http://[a-zA-Z0-9._-]*altavista[a-zA-Z0-9._-]+/' \
            | egrep -ive '<li><a href="http://[a-zA-Z0-9._-]*webcrawler[a-zA-Z0-9._-]+/' \
            | egrep -ive '<li><a href="http://[a-zA-Z0-9._-]*lycos[a-zA-Z0-9._-]+/' \
            | egrep -ive '<li><a href="http://.*metacrawler\.com/' \
            | egrep -ive '<li><a href="http://.*search\.com/' \
            | egrep -ive '<li><a href="http://.*digiweb\.com/' \
            | egrep -ive '<li><a href=".*(search\.opentext|infoseek)\.com' \
            | egrep -ive '<li><a href=".*(www\.excite\.com/search\.gw)' \
            | egrep -ive '<li><a href=".*OTI_Robot\.html' \
            | egrep -ive '<li><a href="http://www\.uwaterloo\.ca/UWaterlooSpider\.html">' \
            | egrep -ive '<li><a href="http://www\.otspider\.uwaterloo\.ca/' \
            | egrep -ive '<li><a href="http://metacrawler\.cs\.washington\.edu' \
            | egrep -ive '<li><a href="http://(av|search)\.yahoo\.com' \
            | egrep -ive '<li><a href="http://www\.yk\.rim\.or\.jp/' \
            | egrep -ive '<li><a href="http://www\.hotbot\.com/' \
            | egrep -ive '<li><a href="http://search\.dogpile\.com/' \
            | egrep -ive '<li><a href="http://.*\.dejanews\.com/' \
            | egrep -ive '<li><a href="http://.*\.inference\.com/' \
            > $pagename
            rm ${pagename}.bak
        endif
        popd >/dev/null
    end
end
# Back in ~/public_html (every pushd above is paired with a popd).
chmod 755 .
Mark Frazer -- mjfrazer@gmail.com