#!/usr/bin/perl # # kjl/29dec2002 # # create_db.pl # # Create miscellaneous databases for bibliometric ranking # and other purposes. # # Usage: # # create_db.pl [-H path] # # where path is the path to the broker, usually # /usr/local/harvest/brokers/YOUR_BROKER # # You can invoke it directly at command line or # embed it into glimpseindex script. # # Add the perl library path to the @INC path @P = split('/', $0); pop @P; $medir = join('/',@P); unshift(@INC, $medir); use DB_File; require 'normal.pm'; # # Edit below if necessary # #$DEBUG = 1; # Where to put the database files $db_rev = "db.ref_rev"; $db_url = "db.ref_url"; $db_cache = "db.ref_cache"; $db_clean = "db.ref_clean"; # Default broker directory. This can be overriden by # "-H /your/path" argument. $BROKERDIR = "/usr/local/harvest/brokers/tengu.local"; # End of configuration # No changes necessary below # Where is the broker directory $DATADIR = parse_arg (); if ($DATADIR eq "") { $DATADIR = $BROKERDIR; } chdir $DATADIR or die "Can't chdir to $DATADIR: $!"; unlink_db_file (); tie_db (); put_log ("Parsing objects"); parse_obj (); put_log ("Building cache databases"); build_cache (); put_log ("Building cleaned up database"); clean_base (); untie_db (); put_log ("Finished"); # End of program # # Parse arguments and fetch the broker directory # sub parse_arg { my $i = 0; my $datadir = ""; foreach (@ARGV) { $i++; last if /\-H/o; } $datadir = $ARGV[$i] if ($i <= $#ARGV); return $datadir; } # # remove database files # sub unlink_db_file { unlink $db_fwd, $db_rev, $db_url, $db_clean, $db_cache; } # # open databases # sub tie_db { # Allow duplicates in BTREE databases. $DB_BTREE->{'flags'} = R_DUP; $REV_STAT = tie %rev_hash, 'DB_File', $db_rev, O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Can't open DB file $db_rev: $!"; $URL_STAT = tie %url_hash, 'DB_File', "$db_url", O_RDWR|O_CREAT, 0666, $DB_HASH or die "Can't open DB file $db_url: $!"; $CLEAN_STAT = tie %clean_hash, 'DB_File', "$db_clean", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Can't open DB file $db_clean: $!"; $CACHE_STAT = tie %cache_hash, 'DB_File', "$db_cache", O_RDWR|O_CREAT, 0666, $DB_HASH or die "Can't open DB file $db_cache: $!"; } # # close databases # sub untie_db { undef $REV_STAT; untie %rev_hash; undef $URL_STAT; untie %url_hash; undef $CLEAN_STAT; untie %clean_hash; undef $CACHE_STAT; untie %cache_hash; } # # print current local time # sub print_time { my $now = localtime(); print "$now"; } # # log string # sub put_log { my $log = shift; print_time (); print "\t$log\n"; } # # recurse through broker's object directory and create database # sub parse_obj { my $OBJDIR = "objects"; my ($dir, $obj); opendir (DIR, "$OBJDIR"); while ($dir = readdir (DIR)) { next if (($dir eq "\.") or ($dir eq "\.\.")); opendir (SUBDIR, "$OBJDIR/$dir"); while ($obj = readdir (SUBDIR)) { next if (($obj eq "\.") or ($obj eq "\.\.")); process_obj ("$OBJDIR/$dir/$obj"); } closedir (SUBDIR); } closedir (DIR); } # # process one object # sub process_obj { my $obj = shift; my ($url, $ttype, $ref, @ref_ary, $proto, $t); my ($host, $path, $file, $prev); open (IN, "<$obj") or die "Can't open $obj: $!"; ($url, $ttype, $ref) = extract_ref(); close (IN); chop $url; @ref_ary = sort (split /\n/, $ref); next if (($url eq "") || ($ttype eq "") || ($ref_ary[0] eq "") || (($ttype ne "HTML") && ($ttype ne "HTTP-Query"))); print "$obj\t$url\t$ttype\n"; # ($proto, $host, $path, $file) = $url =~ /(\S+):\/\/(\S+?)\/(.*?)\/(.*)/; ($proto, $t) = $url =~ /(\w+?):(.*)/; ($host) = $t =~ /\/\/(\S+)/; if ($host =~ /\//) { ($host, $t) = $host =~ /(\S+?)\/(.*)/; ($path, $file) = $t =~ /(.*)\/(.*)/; } $prev = ""; while ($_ = shift @ref_ary) { next if /^#/; s/(\S+)#.*/$1/; if (/\w+:\S+/) { $link = $_; } else { $link = "http://$host/$path/$_"; } $link = normal::normalize_uri ($link); next if ($prev eq $link); $prev = $link; build_db ($url, $link); } } # # Build databases # sub build_db { my $url = shift; my $link = shift; $rev_hash {$link} = $url; $url_hash {$url} = 1; # any value, we need just exists $key } # # Build a db with URL -> number of references to the URL. # This currently only counts direct hits only. # sub build_cache { my $key, $n; foreach $key (keys %url_hash) { $n = $REV_STAT->get_dup($key); print "$n\t$key\n"; $cache_hash {$key} = $n if ($n > 1); } } # # Build a databse containing only URLs, which is also in the Broker. # sub clean_base { my @ref_ary; foreach $key (keys %url_hash) { if (exists $rev_hash {$key}) { print "Copying $key->\n"; @ref_ary = $REV_STAT->get_dup ($key); foreach (@ref_ary) { print "\t$_\n"; $clean_hash {$key} = $_; } } else { print "$key\tignored\n"; } } } # # extract references from object # sub extract_ref { my ($ttype, $url, %SOIF) = soif_parse(); # We only need FILE. return if ($ttype ne "FILE"); return if (!exists $SOIF{'type'}); return if (!exists $SOIF{'url-references'}); return ("$url\n", $SOIF{'type'}, $SOIF{'url-references'}); } # # This is from soif.pl. # soif_parse - Returns an associative array containing the SOIF, # the template type, and the URL. # sub soif_parse { print "Inside soif_parse.\n" if ($debug); return () if (eof(IN)); # DW my $template_type = "UNKNOWN"; my $url = "UNKNOWN"; my %SOIF; undef %SOIF; my ($attr, $vsize, $value, $end_value); while () { print "READING input line: $_\n" if ($debug); last if (/^\@\S+\s*{\s*\S+\s*$/o); } if (/^\@(\S+)\s*{\s*(\S+)\s*$/o) { $template_type = $1, $url = $2; } else { return ($template_type, $url, %SOIF); # done } while () { if (/^\s*([^{]+){(\d+)}:\t(.*\n)/o) { $attr = $1; $vsize = $2; $value = $3; if (length($value) < $vsize) { $nleft = $vsize - length($value); $end_value = ""; $x = read(IN, $end_value, $nleft); die "Cannot read $nleft bytes: $!" if ($x != $nleft); $value .= $end_value; undef $end_value; } chop ($value) if ($value =~ /\n$/); $SOIF{$attr} = $value; undef $value; undef $end_value; next; } last if (/^}/o); } return ($template_type, $url, %SOIF); } =unused # # return file name of a string # sub basename { my $path = shift; $path =~ /.*\/(.*)/; return $1; } # # return directory name of a string # sub dirname { my $path = shift; $path =~ /(.*)\/.*/; return $1; } =cut