#!/bin/gawk -f
## Start-up: configure matching, reduce the argument list to a single input
## file, and initialise the carry-over flag used by the main matching rule.
BEGIN {
    ## Ignore case, as some websites may have HREF or href or HrEf
    ## and HTML isn't case sensitive.
    IGNORECASE = 1

    ## Check the number of files being passed as arguments.
    ## If there is more than one, delete all apart from the first and
    ## tell the user.
    if (ARGC > 2) {
        print "\nScript only processes one file at a time. \n\nProcessing", ARGV[1], "\n"

        ## Delete any excess arguments from the ARGV array so that only
        ## ARGV[1] is read as input.
        for (i = 2; i <= (ARGC - 1); i++) {
            delete ARGV[i]
        }
    }
    else {
        print "\nProcessing file", ARGV[1], "\n"
    }

    ## "true" while an attribute found on one record still needs text
    ## from the next record to be complete.
    carriesOver = "false"
}

## Record one occurrence of a webpage link (an href attribute).
##   path - the attribute text, e.g. href="index.html"; it is cropped to the
##          quoted value and used as the key into the webpages counter array.
function webpageLinkFound(path)
{
    path = stripQuotes(path)

    ## An unseen key starts out as zero, so a plain increment yields 1 for
    ## the first occurrence and counts upwards after that.
    webpages[path]++
}

## Record one occurrence of a non-webpage link (a src attribute).
##   path - the attribute text, e.g. src="img/logo.png"; it is cropped to
##          the quoted value before being counted.
##   target, parts, count, extension - local scratch variables (awk idiom:
##          extra parameters); callers pass only path.
## Paths whose file extension is a known image type are counted in images[];
## every other path is counted in others[].
function otherLinkFound(path,    target, parts, count, extension)
{
    path = stripQuotes(path)    ## Crop down to what is inside the quotes.

    ## Ignore any query string or fragment when looking for the extension,
    ## so that "a.png?v=1.2" is still seen as a png.
    target = path
    sub(/[?#].*/, "", target)

    ## The extension is whatever follows the LAST dot; a path without a dot
    ## has no extension at all.
    count = split(target, parts, ".")
    extension = (count > 1) ? tolower(parts[count]) : ""

    ## Every alternative is tested against the extension itself (a bare
    ## /regex/ in a condition would be matched against the whole input
    ## record instead).
    if (extension ~ /^(gif|jpe?g|png|t?j?iff?|bmp|raw)$/) {
        images[path]++
    }
    else {
        ## Anything that is not an image, with or without an extension.
        others[path]++
    }
}
## Route a found attribute to the right counter according to its prefix:
## text starting with "href" is handed to webpageLinkFound, text starting
## with "src" is handed to otherLinkFound, anything else is ignored.
##   path - the attribute text as found in the input.
function linkFound(path)
{
    if (path ~ /^href/) {
        webpageLinkFound(path)
    }
    else if (path ~ /^src/) {
        ## Cannot also start with "href", so the two cases are exclusive.
        otherLinkFound(path)
    }
}

## Crop a string so that only the text inside its first pair of double
## quotes is left.
##   foundMatch - attribute text such as href="index.html"
##   value      - local scratch variable (awk idiom: extra parameter);
##                callers pass only foundMatch.
## Returns the text between the first and second double quote, or the empty
## string when the input holds fewer than two double quotes.
function stripQuotes(foundMatch,    value)
{
    ## Drop everything up to and including the opening quote.
    match(foundMatch, /"/)
    foundMatch = substr(foundMatch, RSTART + 1)

    ## Keep everything before the closing quote. awk strings are indexed
    ## from 1; a start index of 0 is out of range and some awk
    ## implementations then return one character fewer than requested.
    match(foundMatch, /"/)
    value = substr(foundMatch, 1, RSTART - 1)

    return value
}
##Matching section: runs for every input record and scans its
##whitespace-separated fields for href and src attributes.
{
    ## If the previously found attribute wasn't finished with a closing
    ## quote, concatenate the first two fields of this record onto it and
    ## record the result. A record without fields leaves the carry-over
    ## pending instead of recording an empty path.
    if (carriesOver == "true" && NF > 0) {
        foundMatch = (carriedString $1 $2)
        carriesOver = "false"
        linkFound(foundMatch)
    }

    for (i = 1; i <= NF; i++) {
        if ($i ~ /^href=".*"/ || $i ~ /^src=".*\.+.*"/) {
            ## The whole attribute sits in this one field.
            linkFound($i)
        }
        else if ($i ~ /^href=/ || $i ~ /^src=/) {
            ## The attribute starts here but its quoted value is not
            ## complete; finish it from the next record.
            carriedString = $i
            carriesOver = "true"
        }
        else if ($i ~ /^href/ || $i ~ /^src/) {
            ## This section concatenates the found string if there are
            ## spaces between the attribute name, the = sign and the path.
            ## Fields past NF read as empty strings, so looking ahead at
            ## the end of a record is harmless.
            incomplete = $i $(i + 1)

            if (incomplete ~ /^(href|src)=/) {
                incomplete = incomplete $(i + 2)

                if (incomplete ~ /^href=".*"/ || incomplete ~ /^src=".*\.+.*"/) {
                    ## Record the link (merely cropping it with stripQuotes
                    ## would throw the result away).
                    linkFound(incomplete)
                }
                else {
                    carriedString = incomplete
                    carriesOver = "true"
                }
            }
        }
    }
}

## Final report: print the three counter arrays (webpage links, images and
## other links) once all input has been read.
END {
    printLinkReport("\n Webpage Links found on page:", webpages)
    printLinkReport("\n\n Images found on page:", images)
    printLinkReport("\n\n Other Links found on page:", others)
}

## Print one report section.
##   title  - heading text, including any leading newlines used as spacing;
##            it is followed by the input file name taken from ARGV[1].
##   counts - array mapping a path to the number of times it was found.
##   key    - local loop variable (awk idiom: extra parameter).
## The order of the rows is whatever order awk's "for (key in array)"
## produces.
function printLinkReport(title, counts,    key)
{
    print title, ARGV[1], "\n"

    ## The blank line after the column headings comes from the format
    ## string, so the heading text itself is aligned with the values below.
    printf("%-20s%30s\n\n", "URL", "Frequency")

    for (key in counts) {
        printf("%-20s%30s\n", key, counts[key])
    }
}

#####
## My code matches anything in this format: href="#anypath#.xhtml"
##
## (.*\.+x?html?)
##