Listing of clean_web_referrers


#!/bin/csh -f
#
# clean_web_referrers
#
# Strips unwanted entries out of the referrer pages kept under
# ~/public_html.  Every *.html file found below a directory named
# ".referrers" is filtered in place: each line that matches one of the
# egrep patterns further down (links from this site itself, file: and
# news: URLs, search engines, spiders, ...) is removed.
#
# The tree is set to mode 700 while the script runs and back to 755 when
# it is done.  On an interrupt, a page that was being rewritten is put
# back from its backup copy and the mode is still restored.
#
# -f on the interpreter line keeps ~/.cshrc from being read, so aliases
# for cp, rm or cd cannot change what the commands below do.
#
# Limitation: the file lists come from unquoted command substitution, so
# paths that contain white space are split into several words and skipped
# over incorrectly.
#
# Exit status: 0 after a normal run, 1 when the run was interrupted.

umask 22
cd ~/public_html

set rc = 0
# From here on an interrupt must still undo the chmod 700 below.
onintr interrupted

chmod 700 .
# wait for any accesses to finish up.
sleep 10
foreach d (`find . -type d -name .referrers -print`)
    foreach f (`find $d -type f -name \*.html -print` )
	# Work on the full path that find reported.  The inner find descends
	# into subdirectories of $d, so the file is not necessarily directly
	# inside $d and its base name alone is not enough to locate it.
	cp $f ${f}.bak
	# Without a good backup the redirection below would truncate the
	# only copy of the page, so leave this page alone.
	if ($status != 0) continue
	# "backup" is set only while a complete backup copy exists; the
	# interrupt handler uses that to decide whether to restore the page.
	set page = $f
	set backup = ${f}.bak
	#
	# Author note kept from the original: do not bother trying to put
	# all of these into one RE; egrep could not take a RE that big, even
	# with the RE in a file (the -f option), and with several -e options
	# it only looked at the last one.  Hence one egrep per pattern.
	#
	# The output is redirected onto the existing page (instead of moving
	# a new file into place) so the page file itself is kept and only its
	# contents are replaced.
	#
	# The three bracket expressions below used the range A-z, a typo for
	# A-Z that also admitted the punctuation between Z and a.
	#
	egrep -ive '<li><a href="http://(www.pads)(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' < $backup \
	    | egrep -ive '<li><a href="http://(dictator)(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' \
	    | egrep -ive '<li><a href="http://se(.math)?(\.uwaterloo\.ca)?(:[0-9]*)?/(~|%7e)mjfrazer/' \
	    | egrep -ive '<li><a href="http://(coulomb|sunee|www\.ece)(\.uwaterloo\.ca)?/(~|%7e)mjfrazer/' \
	    | egrep -ive '<li><a href="http://(www.pads|dictator)(:[0-9]*)?/' \
	    | egrep -ive '<li><a href="(file|news):' \
	    | egrep -ive '<li><a href="http://www.alta-vista.com/' \
	    | egrep -ive '<li><a href="http://[a-zA-Z0-9\._-]*altavista[a-zA-Z0-9\._-]+/' \
	    | egrep -ive '<li><a href="http://[a-zA-Z0-9\._-]*webcrawler[a-zA-Z0-9\._-]+/' \
	    | egrep -ive '<li><a href="http://[a-zA-Z0-9\._-]*lycos[a-zA-Z0-9\._-]+/' \
	    | egrep -ive '<li><a href="http://.*metacrawler.com/' \
	    | egrep -ive '<li><a href="http://.*search.com/' \
	    | egrep -ive '<li><a href="http://.*digiweb.com/' \
	    | egrep -ive '<li><a href=".*(search\.opentext|infoseek)\.com' \
	    | egrep -ive '<li><a href=".*(www\.excite\.com/search\.gw)' \
	    | egrep -ive '<li><a href=".*OTI_Robot\.html' \
	    | egrep -ive '<li><a href="http://www.uwaterloo.ca/UWaterlooSpider.html">' \
	    | egrep -ive '<li><a href="http://www.otspider.uwaterloo.ca/' \
	    | egrep -ive '<li><a href="http://metacrawler.cs.washington.edu' \
	    | egrep -ive '<li><a href="http://(av|search)\.yahoo\.com' \
	    | egrep -ive '<li><a href="http://www.yk.rim.or.jp/' \
	    | egrep -ive '<li><a href="http://www.hotbot.com/' \
	    | egrep -ive '<li><a href="http://search.dogpile.com/' \
	    | egrep -ive '<li><a href="http://.*.dejanews.com/' \
	    | egrep -ive '<li><a href="http://.*.inference.com/' \
	    > $page
	rm $backup
	unset backup
    end
end
goto finish

interrupted:
# Ignore further interrupts while things are put back.
onintr -
# If a page was in the middle of being rewritten, its backup is complete
# and still on disk: copy it back over the page, then drop the backup.
if ($?backup) then
    if (-f $backup) then
	cp $backup $page
	rm -f $backup
    endif
endif
set rc = 1

finish:
# No cd happens inside the loops, so "." is still ~/public_html here.
chmod 755 .
exit $rc

 frickin' computers Mark Frazer -- mjfrazer@gmail.com  frickin' computers