0a1257cfcd
always put site cache locally
183 lines
5.1 KiB
Plaintext
--- makenh.orig	Tue Jul 28 03:21:30 1998
+++ makenh	Wed Nov  4 07:05:47 1998
@@ -68,6 +68,7 @@
 $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*';
 $NumLocalCollected = 0;
 $NumRemoteCollected = 0;
+$max_redir = 6;
 # LOGFILE, ERRFILE -- files for logging
 
 ### *TO CHANGE TRAVERSAL*
@@ -105,6 +106,7 @@
 $LOGFILENAME = ".wg_log";
 # $STARTFILE = ".wgstart";
 $WGADDSEARCH = ".wgfilter-box";
+$SITECACHE = ".wgsitecache";
 
 $ROBOTNAME = "HTTPGET";
 
@@ -187,22 +189,22 @@
 
 # Initialize variables to avoid warnings
 ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
- $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) =
-    ('','','','','','','','','','','');
+ $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset) =
+    ('','','','','','','','','','','','','');
 
 ($title, $urlpath, $traverse_type, $explicit_only, $numhops,
- $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @urllist) = ReadConfig($archivepwd);
+ $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset, @urllist) = ReadConfig($archivepwd);
 
 # open logs
 &open_logs();
 
 print LOGFILE "From Configuration:\n";
 my(@configlist) = qw(title urlpath traverse_type explicit_only numhops
-                     nhhops local_limit remote_limit addboxes vhost usemaxmem ) ;
+                     nhhops local_limit remote_limit addboxes vhost usemaxmem locale charset) ;
 foreach $item (@configlist)
 {
     $value = '';
-    eval "$value = \$$item";
+    eval "\$value = \$$item";
     print LOGFILE "  $item: $value\n";
 }
 print LOGFILE "  urllist: @urllist\n\n";
@@ -231,6 +233,7 @@
 $MAPFILE = "$archivepwd/$MAPFILE";
 $TEMPROBOTFILE = "$archivepwd/$TEMPROBOTFILE";
 $WGADDSEARCH = "$archivepwd/$WGADDSEARCH";
+$SITECACHE = "$archivepwd/$SITECACHE";
 
 ($archiveprot, $archivehost, $archiveport, $archivepath) =
     &url::parse_url($archiveurl);
@@ -252,7 +255,7 @@
 
 # read in the site configuration
 &siteconf::ReadConf($vhost);
-&siteconf::LoadCache();
+&siteconf::LoadCache("$SITECACHE");
 
 ###############
 ### PHASE 1 ###
@@ -398,7 +401,7 @@
 &close_logs();
 
 # remove the robots file
-system("rm -rf $TEMPROBOTFILE");
+unlink($TEMPROBOTFILE);
 
 #----------------------
 #change the dir back
@@ -751,7 +754,7 @@
     my($prot, $host, $port, $path) = &url::parse_url($url);
 
     # if the protocol isn't http, assume it's good
-    if($prot!~/http/i){
+    if(!defined($prot) || $prot!~/http/i){
        return 1;
     }
 
@@ -800,6 +803,7 @@
     my($output);
     my($olddata, $newdata);
     my($newprot, $newhost, $newport, $newpath, $url);
+    my($redcount)=0;
 
     # make the $url
     $url = "http://$host:$port/robots.txt";
@@ -815,6 +819,7 @@
     while($output ne ""){
        # more for error?
        if($output=~/^error/i){
+           truncate($TEMPROBOTFILE,0);
            print ERRFILE "Error with getting $url\n";
 #          print LOGFILE "Error with getting $url\n";
            last;
@@ -822,7 +827,13 @@
 
        # look at output for redirect -- store redirects in file, too
        if($output=~/^Redirect: (.*)$/){
-           print LOGFILE "Redirected to: $1...";
+           if ($redcount >= $max_redir) {
+               truncate($TEMPROBOTFILE,0);
+               print ERRFILE "Too many redirections with $url\n";
+               last;
+           }
+           $redcount++;
+           print LOGFILE "Redirected to: $1...\n";
 
            # see if we have the redirected server
            ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1);
@@ -843,6 +854,7 @@
            }
        }else{
            # we've got it, or there's an error...
+           truncate($TEMPROBOTFILE,0);
            last;
        }
     }
@@ -894,6 +906,7 @@
 sub geturl2file{
     my($url) = @_;
     my($output, $link, $file, $oldfile, @aliases);
+    my($redcount)=0;
 
     # check if we have that in stock (we know it's not local)
     if (defined($URL2FILE{$url})) {
@@ -930,6 +943,7 @@
     while($output ne ""){
        # more for error?
        if($output=~/^error/i){
+           truncate($file,0);
            print ERRFILE "Error with getting $url: $output\n";
 #          print LOGFILE "Error with getting $url\n";
            last;
@@ -937,6 +951,12 @@
 
        # look at output for redirect -- store redirects in file, too
        if($output=~/^Redirect: (.*)$/){
+           if ($redcount >= $max_redir) {
+               truncate($file,0);
+               print ERRFILE "Too many redirections with $url\n";
+               last;
+           }
+           $redcount++;
            &ungetnewname(); # rewind the name counter
            # The next get will overwrite the unnecessary file
 
@@ -970,6 +990,7 @@
            }
        }else{
            # we've got it, or there's an error...
+           truncate($file,0);
            last;
        }
     }
@@ -1159,6 +1180,15 @@
     ($prot, $host, $port, $path) = &url::parse_url($url);
     #print "URL after parsing: $prot://$host:$port$path\n";
 
+    next if !defined($prot);
+    if (!defined($port) ||
+       ($port eq '80' && $prot =~ /^https?$/) ||
+       ($port eq '21' && $prot eq 'ftp')) {
+       $port = '';
+    } else {
+       $port = ":$port";
+    }
+
     # make sure the path has a preceding /
     $path = "/$path" if $path!~/^\//;
 
@@ -1177,7 +1207,7 @@
 #      $host = "$a.$b.$c.$d";
 #    }
 
-    $url = "$prot://$host:$port$path";
+    $url = "$prot://$host$port$path";
     #print "URL after normalization: $url\n";
 
     # strip off any #text