#
# mhwc - Manfred Haertel's Web Client
#
# manfred haertel    18.11.1997
#
#
# revisions
# =========
# vers  date         modification
# --------------------------------------------------------------
# 1.1   25.11.1997   added chop while reading robots.txt to avoid
#                    reading cr's, otherwise matches may fail
#

#
# includes
# note that Socket has to be spelled with a large S
#

use Socket ;

#
# version
#

$our_version = "1.0" ;

#
# get all urls
#

open ( url_in_handle , "url.dat" ) ;
$counter = 0 ;

while ( $url_in[$counter] = <url_in_handle> )
{
	$counter ++ ;
}

print "Knowing $counter URL(s) ...\n" ;

#
# choose one per random
#

srand ( time () ) ;
$random_number = int(rand $counter) ;

$our_url = $url_in[$random_number] ;
chop ( $our_url ) ;
printf ( "Requesting $our_url ...\n\n" ) ;

#
# parse our own url
#

#
# first case: no filespec, so nodename follows double slash and filespec is /
#

$number_of_slashes = ( $our_url =~ tr/\//\// ) ;
if ( $number_of_slashes == 2 )
{
	$our_url =~ /\/\// ;
	$our_node = $' ;
	$our_filespec = "/" ;
	$our_dir = "/" ;
}
else
{
	#
	# nodename is between double slash and next slash
	#

	$our_url =~ /\/\// ;
	$our_node = $' ;
	$our_node =~ /\// ;
	$our_node = $` ;

	#
	# file specification follows node name
	#

	$our_url =~ /$our_node/ ;
	$our_filespec = $' ;

	#
	# directory follows node and ends at last slash
	# if only 3 slashes in total, it's simply /
	#

	if ( $number_of_slashes == 3 )
	{
		$our_dir = "/" ;
	}
	else
	{
		$our_dir = $our_filespec ;
		$rest = $our_filespec ;
		while ( $rest =~ /\// )
		{
			$rest = $' ;
		}
		if ( $rest =~ /^$/ )
		{
			$our_dir = $our_filespec ;
		}
		else
		{
			$our_dir =~ /$rest/ ;
			$our_dir = $` ;
		}
	}
}

#
# check for robot exclusion
#

$robots_allowed = check_robot ( $our_node , $our_dir ) ;

if ( ! $robots_allowed )
{
	print "Sorry, no robot access allowed.\n" ;
	exit ;
}

#
# open socket
#

open_socket ( socket_handle , $our_node ) ;

#
# ask for information
#

print "GET $our_filespec HTTP/1.0\n" ;

print socket_handle "GET $our_filespec HTTP/1.0\n" ;
print socket_handle "User-agent: MHWC V$our_version - Manfred Haertel's Web Client\n" ;
print socket_handle "\n" ;

#
# skip everything until first empty line
#

while ( ( $line = <socket_handle> ) !~ /^\r/ )
{
	print $line ;
}

#
# get unique name for output file
#

$html_filename = time () ;

#
# create and open output files
# note that we have to change the html file to binary mode
# because otherwise perl replaces lf with cr/lf and this is
# bad for jpeg files or executables
#

open ( html_handle , ">$html_filename.html" ) ;
binmode ( html_handle ) ;

open ( url_handle , ">>url.dat" ) ;

#
# get a line until eof
#

get_line: while ( $line = <socket_handle> )
{
	#
	# save line in output file
	#

	print html_handle $line ;

	#	
	# check for hypertext reference
	# could be lower or upper case
	#

	if ( ( $line =~ /href=/ ) || ( $line =~ /HREF=/ ) )
	{
		#
		# get every thing behind the search string
		#

		$url = $' ;

		#
		# now match to the closing bracket and get everything before it
		#

		$url =~ />/ ;
		$url = $` ;

		#
		# take away quotation marks (if there)
		#

		$url =~ s/"// ;
		$url =~ s/"// ;

		#
		# if a link contains no protocol we assume http
		#

		$url =~ /\// ;
		$before_slash = $` ;

		if ( $before_slash !~ /:/ )
		{
			$url = "http:$url" ;
		}

		#
		# we're only interested in http references
		#

		if ( $url !~ /^http:/ )
		{
			next get_line ;
		}

		#
		# if http: is not followed by a slash, then we have
		# just a filename or a relative directory
		# so we insert our own
		#

		if ( $url !~ /http:\// )
		{
			$url =~ /http:/ ;
			$url = "http:$our_dir$'" ;
		}

		#
		# if http: ist not followed by a double slash, then we
		# have no nodename.
		# again we insert our own
		#

		if ( $url !~ /http:\/\// )
		{
			$url =~ /http:\// ;
			$url = "http://$our_node/$'" ;
		}

		#
		# save it
		#

		print "$url\n" ;
		print url_handle "$url\n" ;
	}
}

#
# close outfiles
#

close ( html_handle ) ;
close ( url_handle ) ;

#
# close socket
#

close ( socket_handle ) ;

#
# bye
#

exit ;

#
# subroutine for opening a socket connection
#

sub open_socket
{
	#
	# get parameters
	#

	( $this_handle , $this_node ) = @_ ;

	#
	# does the node name contain a port number?
	# if yes, we find it after a colon
	# if no, the port number is assumed to be 80
	#

	if ( $this_node =~ /:/ )
	{
		$port = $' ;
		$this_node = $` ;
	}
	else
	{
		$port = 80 ;
	}		

	#
	# open connection
	#

	$protocol = getprotobyname ( 'tcp' ) ;
	socket ( $this_handle , PF_INET , SOCK_STREAM , $protocol ) ;
	$destination = sockaddr_in ( $port , inet_aton ( $this_node ) ) ;
	connect ( $this_handle , $destination ) 
		or die "Could not connect to $this_node!\n" ;

	#
	# don't buffer
	#

	$temp_handle = select ( $this_handle ) ;
	$| = 1 ;
	$this_handle = select ( $temp_handle ) ;
}

#
# subroutine for checking robot exclusion
#

sub check_robot
{
	#
	# get parameters
	#

	( $this_node , $this_dir ) = @_ ;

	$check_dir = $this_dir ;
	$check_dir =~ tr/A-Z/a-z/ ;

	#
	# open socket
	#

	open_socket ( socket_handle , $our_node ) ;

	#
	# ask for information
	# we also identify ourselves
	#

	print socket_handle "GET /robots.txt HTTP/1.0\n" ;
	print socket_handle "User-agent: MHWC V$our_version - Manfred Haertel's Web Client\n" ;
	print socket_handle "\n" ;

	#
	# first check status code. be conservative: 404 (file not found)
	# means, we may, 200 (ok) means, we have to check, everything else
	# means: we may not
	#

	$line = <socket_handle> ;
	print $line ;

	if ( $line =~ /404/ )
	{
		close ( socket_handle ) ;
		return 1 ;
	}

	if ( $line !~ /200/ )
	{
		close ( socket_handle ) ;
		return 0 ;
	}

	#
	# now check all the lines
	# rules: first match is valid, no match means: allowed
	# note that comparison should be case insensitive
	#

	#
	# first read all lines into an array (in lowercase)
	#

	$counter = 0 ;	

	while ( $next_line = <socket_handle> )
	{
		chop ( $next_line ) ;
		print "$next_line\n" ;
		$next_line =~ tr/A-Z/a-z/ ;
		$line[$counter] = $next_line ;
		$counter ++ ;
	}

	$last_line = $counter - 1 ;

	#
	# check for a line with "user-agent: *"
	#

	$agent_line = - 1 ;

	for ( $counter = 0 ; $counter <= $last_line ; $counter ++ )
	{
		if ( $line[$counter] =~ /user-agent: */ )
		{
			$agent_line = $counter ;
		}
	}

	#
	# now check for a line with "user-agent: mhwc", it has higher prio
	#

	for ( $counter = 0 ; $counter <= $last_line ; $counter ++ )
	{
		if ( $line[$counter] =~ /user-agent: mhwc/ )
		{
			$agent_line = $counter ;
		}
	}

	#
	# if no such lines were found, access is allowed
	#

	if ( $agent_line == -1 )
	{
		close ( socket_handle ) ;
		return 1 ;
	}

	#
	# otherwise check the records AFTER the user-agent line
	#

	$counter = $agent_line + 1 ;
	$match_found = 0 ;

	while ( ( $counter <= $last_line ) && ( ! $match_found ) )
	{
		$match_string = $line[$counter] ;

		#
		# directory starts at first slash
		# it stops at tab, blank or return
		#

		$match_string =~ /\// ;
		$match_string = "/$'" ;

		if ( $match_string =~ /\t/ )
		{
			$match_string = $` ;
		}

		if ( $match_string =~ / / )
		{
			$match_string = $` ;
		}

		if ( $match_string =~ /\r/ )
		{
			$match_string = $` ;
		}

		if ( $check_dir =~ /$match_string/ )
		{
			$match_found = 1 ;
			$matching_line = $line[$counter] ;
		}
		$counter ++ ;
	}

	#
	# no match found: access allowed
	#

	if ( ! $match_found )
	{
		close ( socket_handle ) ;
		return 1 ;
	}

	#
	# check if disallow or allow
	#

	if ( $matching_line =~ /disallow/ )
	{
		close ( socket_handle ) ;
		return 0 ;
	}
	else
	{
		close ( socket_handle ) ;
		return 1 ;
	}
}