Beruflich Dokumente
Kultur Dokumente
#!/bin/gawk -f
BEGIN{IGNORECASE=1 ##Ignore case as some websites may have HREF or href or HrEf
## and HTML isn't case sensitive.
##Checks the number of files being passed as arguments.
##If more than one argument is given then delete all apart from the first and
##prompt the user.
else if(others[path]){others[path]+=1}
else{others[path] = 1}
}
##Function that routes a matched attribute string to the right handler:
##text starting with href is passed to webpageLinkFound, text starting with
##src is passed to otherLinkFound. Anything else is ignored.
function linkFound(path){
if(path ~ /^href/){
webpageLinkFound(path)
}
else if(path ~ /^src/){
otherLinkFound(path)
}
}
##Function that crops the string so that only the path inside the quotes is left.
##  foundMatch - text containing a double-quoted path, e.g. href="index.html"
##  Returns the text between the first pair of double quotes, or "" when the
##  opening or the closing quote is missing.
##  Side effects: the result is also stored in the global variable path (awk has
##  no implicit locals and this was the original behaviour), and match() resets
##  RSTART and RLENGTH.
function stripQuotes(foundMatch){
##Drop everything up to and including the opening quote.
if(!match(foundMatch, /"/)){path = ""; return path}
foundMatch = substr(foundMatch, RSTART + 1);
##Find the closing quote; without one there is no complete path yet.
if(!match(foundMatch, /"/)){path = ""; return path}
##awk strings are indexed from 1. Starting substr() at 0 shortens the result by
##one character, which used to cut the last character off every path.
path = substr(foundMatch, 1, RSTART - 1);
return path
}
##Matching section.
{
##If the previously found path wasn't finished with a " then concatenate the first
##fields from the next line.
if(carriesOver == "true"){foundMatch=(carriedString $1 $2); carriesOver =
"false";linkFound(foundMatch)}
if(NF>0){
##This section concatenates the found string if there are spaces between the href,
##the = sign and the path.
else if($i ~ /^href/ || $i ~ /^src=/)
{incomplete = $i $(i+1); if(incomplete ~ /^href=/)
{incomplete = incomplete $(i+2);if(incomplete ~ /^href=".*"/)
{stripQuotes(incomplete)} else if(incomplete ~ /^href=/ || $i ~ /^src=/)
{carriedString = incomplete; carriesOver="true"}}}
}
}
END{
##Prints the report: a title naming the first command-line argument, a column
##header, then one "path  count" row for every key of the webpages, images and
##others arrays. These arrays are presumably filled by the link handlers defined
##elsewhere in the file; the for-in order is whatever awk chooses.
print "\n Webpage Links found on page:", ARGV[1], "\n"
##The newline that leaves a blank line under the header lives in the format
##string rather than in the argument, so it does not use up one of the 30
##columns and the heading stays right-aligned with the counts below it.
printf("%-20s%30s\n\n","URL","Frequency")
for(path in webpages)
{
printf("%-20s%30s\n",path,webpages[path])
}
for(image in images)
{
printf("%-20s%30s\n",image, images[image])
}
for(other in others)
{
printf("%-20s%30s\n",other, others[other])
}
}
#####
## My code matches anything in this format href="#anypath#.xhtml
##
## (.*\.+x?html?)
##