# # mhwc - Manfred Haertel's Web Client # # manfred haertel 18.11.1997 # # # revisions # ========= # vers date modification # -------------------------------------------------------------- # 1.1 25.11.1997 added chop while reading robots.txt to avoid # reading cr's, otherwise matches may fail # # # includes # note that Socket has to be spelled with a large S # use Socket ; # # version # $our_version = "1.0" ; # # get all urls # open ( url_in_handle , "url.dat" ) ; $counter = 0 ; while ( $url_in[$counter] = ) { $counter ++ ; } print "Knowing $counter URL(s) ...\n" ; # # choose one per random # srand ( time () ) ; $random_number = int(rand $counter) ; $our_url = $url_in[$random_number] ; chop ( $our_url ) ; printf ( "Requesting $our_url ...\n\n" ) ; # # parse our own url # # # first case: no filespec, so nodename follows double slash and filespec is / # $number_of_slashes = ( $our_url =~ tr/\//\// ) ; if ( $number_of_slashes == 2 ) { $our_url =~ /\/\// ; $our_node = $' ; $our_filespec = "/" ; $our_dir = "/" ; } else { # # nodename is between double slash and next slash # $our_url =~ /\/\// ; $our_node = $' ; $our_node =~ /\// ; $our_node = $` ; # # file specification follows node name # $our_url =~ /$our_node/ ; $our_filespec = $' ; # # directory follows node and ends at last slash # if only 3 slashes in total, it's simply / # if ( $number_of_slashes == 3 ) { $our_dir = "/" ; } else { $our_dir = $our_filespec ; $rest = $our_filespec ; while ( $rest =~ /\// ) { $rest = $' ; } if ( $rest =~ /^$/ ) { $our_dir = $our_filespec ; } else { $our_dir =~ /$rest/ ; $our_dir = $` ; } } } # # check for robot exclusion # $robots_allowed = check_robot ( $our_node , $our_dir ) ; if ( ! $robots_allowed ) { print "Sorry, no robot access allowed.\n" ; exit ; } # # open socket # open_socket ( socket_handle , $our_node ) ; # # ask for information # print "GET $our_filespec HTTP/1.0\n" ; print socket_handle "GET $our_filespec HTTP/1.0\n" ; print socket_handle "User-agent: MHWC V$our_version - Manfred Haertel's Web Client\n" ; print socket_handle "\n" ; # # skip everything until first empty line # while ( ( $line = ) !~ /^\r/ ) { print $line ; } # # get unique name for output file # $html_filename = time () ; # # create and open output files # note that we have to change the html file to binary mode # because otherwise perl replaces lf with cr/lf and this is # bad for jpeg files or executables # open ( html_handle , ">$html_filename.html" ) ; binmode ( html_handle ) ; open ( url_handle , ">>url.dat" ) ; # # get a line until eof # get_line: while ( $line = ) { # # save line in output file # print html_handle $line ; # # check for hypertext reference # could be lower or upper case # if ( ( $line =~ /href=/ ) || ( $line =~ /HREF=/ ) ) { # # get every thing behind the search string # $url = $' ; # # now match to the closing bracket and get everything before it # $url =~ />/ ; $url = $` ; # # take away quotation marks (if there) # $url =~ s/"// ; $url =~ s/"// ; # # if a link contains no protocol we assume http # $url =~ /\// ; $before_slash = $` ; if ( $before_slash !~ /:/ ) { $url = "http:$url" ; } # # we're only interested in http references # if ( $url !~ /^http:/ ) { next get_line ; } # # if http: is not followed by a slash, then we have # just a filename or a relative directory # so we insert our own # if ( $url !~ /http:\// ) { $url =~ /http:/ ; $url = "http:$our_dir$'" ; } # # if http: ist not followed by a double slash, then we # have no nodename. # again we insert our own # if ( $url !~ /http:\/\// ) { $url =~ /http:\// ; $url = "http://$our_node/$'" ; } # # save it # print "$url\n" ; print url_handle "$url\n" ; } } # # close outfiles # close ( html_handle ) ; close ( url_handle ) ; # # close socket # close ( socket_handle ) ; # # bye # exit ; # # subroutine for opening a socket connection # sub open_socket { # # get parameters # ( $this_handle , $this_node ) = @_ ; # # does the node name contain a port number? # if yes, we find it after a colon # if no, the port number is assumed to be 80 # if ( $this_node =~ /:/ ) { $port = $' ; $this_node = $` ; } else { $port = 80 ; } # # open connection # $protocol = getprotobyname ( 'tcp' ) ; socket ( $this_handle , PF_INET , SOCK_STREAM , $protocol ) ; $destination = sockaddr_in ( $port , inet_aton ( $this_node ) ) ; connect ( $this_handle , $destination ) or die "Could not connect to $this_node!\n" ; # # don't buffer # $temp_handle = select ( $this_handle ) ; $| = 1 ; $this_handle = select ( $temp_handle ) ; } # # subroutine for checking robot exclusion # sub check_robot { # # get parameters # ( $this_node , $this_dir ) = @_ ; $check_dir = $this_dir ; $check_dir =~ tr/A-Z/a-z/ ; # # open socket # open_socket ( socket_handle , $our_node ) ; # # ask for information # we also identify ourselves # print socket_handle "GET /robots.txt HTTP/1.0\n" ; print socket_handle "User-agent: MHWC V$our_version - Manfred Haertel's Web Client\n" ; print socket_handle "\n" ; # # first check status code. be conservative: 404 (file not found) # means, we may, 200 (ok) means, we have to check, everything else # means: we may not # $line = ; print $line ; if ( $line =~ /404/ ) { close ( socket_handle ) ; return 1 ; } if ( $line !~ /200/ ) { close ( socket_handle ) ; return 0 ; } # # now check all the lines # rules: first match is valid, no match means: allowed # note that comparison should be case insensitive # # # first read all lines into an array (in lowercase) # $counter = 0 ; while ( $next_line = ) { chop ( $next_line ) ; print "$next_line\n" ; $next_line =~ tr/A-Z/a-z/ ; $line[$counter] = $next_line ; $counter ++ ; } $last_line = $counter - 1 ; # # check for a line with "user-agent: *" # $agent_line = - 1 ; for ( $counter = 0 ; $counter <= $last_line ; $counter ++ ) { if ( $line[$counter] =~ /user-agent: */ ) { $agent_line = $counter ; } } # # now check for a line with "user-agent: mhwc", it has higher prio # for ( $counter = 0 ; $counter <= $last_line ; $counter ++ ) { if ( $line[$counter] =~ /user-agent: mhwc/ ) { $agent_line = $counter ; } } # # if no such lines were found, access is allowed # if ( $agent_line == -1 ) { close ( socket_handle ) ; return 1 ; } # # otherwise check the records AFTER the user-agent line # $counter = $agent_line + 1 ; $match_found = 0 ; while ( ( $counter <= $last_line ) && ( ! $match_found ) ) { $match_string = $line[$counter] ; # # directory starts at first slash # it stops at tab, blank or return # $match_string =~ /\// ; $match_string = "/$'" ; if ( $match_string =~ /\t/ ) { $match_string = $` ; } if ( $match_string =~ / / ) { $match_string = $` ; } if ( $match_string =~ /\r/ ) { $match_string = $` ; } if ( $check_dir =~ /$match_string/ ) { $match_found = 1 ; $matching_line = $line[$counter] ; } $counter ++ ; } # # no match found: access allowed # if ( ! $match_found ) { close ( socket_handle ) ; return 1 ; } # # check if disallow or allow # if ( $matching_line =~ /disallow/ ) { close ( socket_handle ) ; return 0 ; } else { close ( socket_handle ) ; return 1 ; } }