Continuity of
web site directories
-
"check" is a wrapper program for "webcheck" which allows recursive use.
- "webcheck" is a Perl script for checking website directories,
and is found [below.]
Check
#!/bin/bash
# $Id: check,v 3.2 2001/06/14 05:11:36 jno Exp $
# /usr/local/bin/check - run webcheck over a whole directory tree
# Wrapper script: start it from the top directory you want checked.
# find visits the current directory and every directory beneath it,
# and runs webcheck once per directory with that directory as its
# only argument. webcheck is looked up by name, so it must be on PATH.
find . -type d -exec webcheck '{}' ';'
Webcheck
#!/usr/bin/perl
# $Id: webcheck,v 3.28 2002/11/30 20:50:54 jno Exp $
# /usr/local/bin/webcheck, Usage: webcheck {directory}
# The directory argument is required. The report is mailed to the
# local user named in $ENV{LOGNAME}.
# DESCRIPTION: A html linting utility written in Perl, which
# checks all internal anchor, img, and background links,
# and can be used recursively. See notes below program.
use strict;
use warnings;

########## NOTE: Make a selection of file extension you wish this
########## program to check for. Include images.
########## NOTE: you must list 'html' before 'htm' -- the alternation
########## takes the first alternative that matches, so 'htm' first
########## would cut "page.html" down to "page.htm".
my $extensions = "html|htm|txt|jpg|gif|zip|wav";

my $currentdate = `date`;       # keeps its trailing newline
my $currentdir  = $ARGV[0];

# A directory argument is mandatory. Test for defined/non-empty rather
# than truth so that a directory literally named "0" is accepted.
( defined $currentdir and length $currentdir )
    or die "must specify directory\n";

# Links are tested relative to the directory being checked, so a failed
# chdir must stop the run instead of silently checking the wrong place.
chdir $currentdir
    or die "cannot chdir to $currentdir: $!\n";
print " directory: $currentdir\n";

# A "link-like word chunk": a run of characters that are not
# ( " | = blank ) followed by a period and one of the known extensions.
# Anything after the extension (such as "#anchor") is not part of the match.
my $link_re = qr/[^(\"|=| )]+\.(?:$extensions)/i;

my $missing = '';   # accumulated report: filename followed by its dead links
my %touched;        # files already touched during this run

########## NOTE: add or delete the "htm", "html", or "shtml" file
########## extensions as needed in the line below. See notes below.
my @filenames = `ls *.htm *.html *.php`;
chomp @filenames;

foreach my $filename (@filenames) {                 # 1- each file
    print "$filename\n";

    my $fh;
    unless ( open $fh, '<', $filename ) {
        warn "cannot open $filename: $!\n";
        next;
    }
    my $all = do { local $/; <$fh> };               # slurp the whole file
    close $fh;
    $all = '' unless defined $all;

    my @code  = $all =~ /<[^>]+>/g;                 # every <...> segment
    my $links = '';                                 # dead links of this file

    foreach my $code (@code) {                      # 2- each segment inspected
        while ( $code =~ /($link_re)/g ) {          # 3- each link-like chunk
            my $link = $1;
            next if $link =~ m{://};                # skip fully qualified URLs

            if ( -e $link ) {
                # Touch linked files so that orphans (never linked) keep
                # an older date. Once per run is enough.
                next if $touched{$link}++;
                ########## NOTE: Hash out the following line to speed up
                ########## WebCheck, and see notes under "Orphans" below.
                utime( undef, undef, $link ) or warn "cannot touch $link: $!\n";
            }
            else {
                $links .= " -- $link\n";            # list if not exist
            }
        }                                           #-3
    }                                               #-2

    $missing .= "$filename\n$links\n" if $links;    # assoc w filename
}                                                   #-1

######### Send email locally (to owner) -- see notes
######### NOTE: if sendmail delays delivery, use procmail instead.
######### Both forms are shown below.
######### Or run /usr/sbin/sendmail -q as root
if ($missing) {
    my @mailer = ( '/usr/sbin/sendmail', '-oi', '-n', '-t' );
    # my @mailer = ( '/usr/bin/procmail', '-Y' );

    # The empty line after "Subject:" separates the mail headers from
    # the message body.
    my $report = <<EOF;
To:$ENV{LOGNAME}
From:WEBCHECK
Subject:$currentdir

"WebCheck" searches all *.htm and *.html files in the logged directory
for word chunks ending in the following file extensions...
$extensions
The current directory and listed paths of a link are inspected for the
existance of files. Fully qualified URLs and name anchors are skipped.
Today's date.. $currentdate
This check was made from .. $currentdir
Missing links are listed by source filename below...
\n$missing
(end)
EOF

    # List-form pipe open: the mailer is run directly, without a shell.
    my $sent = 0;
    if ( open my $mail, '|-', @mailer ) {
        print {$mail} $report;
        $sent = close $mail;    # false if the mailer exited with an error
    }

    if ($sent) {
        print "====== ERRORS reported via email ======\n\n";
    }
    else {
        # Do not lose the report when the mailer is unavailable.
        warn "webcheck: could not mail report via $mailer[0]; report follows\n";
        print STDERR $report;
    }
}
else {
    print " == no email report ==\n\n";
}
#
# SETUP:
#
# - make note of perl and sendmail (or procmail) location, and the
# To: header, and make corrections as needed. The "To:" is currently
# set to $ENV{LOGNAME}. If email notification is to be sent elsewhere,
# change "To:$ENV{LOGNAME}" to another email address.
# Be sure to escape the "@" as "\@"
#
# - See notes in the body of the program concerning appropriate use
# of sendmail or procmail.
#
# - Set the file extensions to be looked for at the variable
#
# $extensions="aaa|bbb|ccc";
#
# be sure (1) the right side is enclosed in quotes followed with ;
# (2) the extensions are separated with the | sign.
# The variable $extensions may be found at the top of the program.
#
# - if "*.html" files are not used, delete this from the line of code
#
# foreach $filename (`ls *.htm *.html`)
#
# "ls" will write a "file not found" message to the screen if there
# are no "html" file extensions, yet this is included in the list.
# Similarly other forms such as "php" can be added in this line.
#
# USAGE:
#
# WebCheck searches all *.htm and *.html files (or other file extensions
# as specified) in the current directory for "word chunks" ending in
# common file extensions included within HTML tags. The current directory,
# and any directory included as part of a filename, are inspected for the
# existence of file names derived from these "link-like" word chunks.
#
#
# The user account is notified by email of missing files. A separate
# e-mail will be sent for each directory where missing file names were
# discovered. The missing files are listed by the name of the htm (or
# html) file where these are called.
#
# Note that _all_ of the files will be inspected, including orphans.
# Thus if you receive strange messages about some files, suspect that
# they may be files requested as links from abandoned html files.
#
# Webcheck can determine orphaned files, that is, files to which no links
# exist, because all inspected files are touched. Orphans thus show up as
# files with earlier dates ("ls -tl" will list and group by dates).
#
# Note that all orphans will not be identified unless WebCheck has been
# executed in each subdirectory. See notes on a recursive wrapper, below.
#
# NOTE: touching files is very time consuming, since the process is
# repeated at every instance a file is encountered. To VOID the
# ability to identify orphaned files, comment out the line..
#
# elsif (-e $&) { system (touch, $&)}
#
# To have webcheck operate recursively through a file system, execute
# the following (this wrapper file is available as "check")..
#
# find . -type d -exec './webcheck' {} \;
#
# (exactly as it appears above) from some starting point in the directory
# system. This assumes webcheck can be found on the path (as for example,
# in /usr/local/bin) or that a copy of webcheck is found in the root
# directory where file checking is started.
#
# Webcheck operates verbosely, listing all the filenames which are
# inspected. To operate silently, hash the lines..
# print "$currentdir\n" ;
# print "$filename";
# print " === ERRORS reported via email ===\n\n" ;
# else { print " === no email report ===\n\n" };
#
# Note that files linked from orphaned files will show as active files
# until the orphans are removed. See "about orphans" above.
#
# WebCheck will catch _any_ word-like link file names (with names of any
# size, including path names), including any nonalphabetical characters
# except the double quote, equal sign, parenthesis, and included blanks.
#
# BUGS AND CAVEATS:
#
# - WebCheck lists missing links as often as they occur in a html file.
# - All anchors of the form "href=http://... etc" are ignored.
# - Name anchors links of the form "file.htm#goto" are stripped of
# the information after the # mark before testing.
#
# COPYRIGHT NOTICES:
#
# Copyright (C) 1998 2001 Kees Cook, Counterpoint Networking, Inc.
# cook (at) outflux (dot) net
# Developmental design: Jno Cook, Aesthetic Investigation, Chicago
# jno (at) blight (dot) com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, version 2.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# http://www.gnu.org/copyleft/gpl.html
#
Website Provider: Outflux.net, www.Outflux.net
URL:http://jnocook.net/geek/webcheck.htm