;
close(DBPRE);
my @old_url;
##grab urls in $database
my $i;
foreach (@here_already) {
( $old_url[ $i++ ] ) = split ( /\\\\\\/, $_ );
}
##make a quick hash from urls already indexed
my %old_urls;
foreach my $urls (@old_url) {
$old_urls{$urls} = 1;
}
print $client "";
###Stop words to be removed from the body
my @stop_words = qw(
at nbsp a an amp and are arial as be but by can com do font for from gif have
he helvetica her his href i if in is it jpg net of on or org our sans ser she
size that the this to us we with you your has such accordance suitable
according having than all http herein also however their then another into there
invention thereby therefore thereof its thereto because means these been
not they being now those claim thus comprises onto corresponding use could
other various described particularly was desired preferably were preferred what
does present when each provide where embodiment provided whereby fig provides
wherein figs relatively which respectively while said who further should will
generally since had some would about above across adj after again against almost
alone along although always am among any anybody anyone anything anywhere apart
around aside away before behind below besides between beyond both cannot deep did
doing done down downwards during either else enough etc even ever every everybody
everyone except far few forth get gets got hardly here herself him himself how
indeed instead inward itself just kept many maybe might mine more most mostly
much must myself near neither next no nobody none nor nothing nowhere off often
only others ought ours out outside over own p per please plus pp quite rather
really s seem self selves several shall so somebody somewhat still theirs them themselves thorough
thoroughly through together too toward towards under until up upon very well
whatever whenever whether whom whose within without yet young yourself accordingly
already apparently arise became briefly came certain certainly different due especially
following found gave give given gives giving gone immediately keep largely like made
mainly make mug nearly necessarily nos noted obtain obtained overall owing past
perhaps please possibly predominantly previously probably prompt promptly ready
regardless same seen show showed shown shows similarly slightly sometime sometimes
soon strongly substantially successfully though throughout unless used usefully
using usually why widely www _
);
my %stop_words;
##Make a hash from the stop words
foreach my $word (@stop_words) {
$stop_words{$word} = 1;
}
open( DD, "$bookmark_file" );
my @ww = ;
close(DD);
my ( $bmcontent, $link, @links, %tmp, @lines );
foreach (@ww) { $bmcontent .= $_ }
while ( $bmcontent =~ /(<\w*\s*\w*=.+)(http.*?)(\")/gi ) {
$link = $2;
$link =~ s/^"//;
$link =~ s/".+//;
push @links, $link unless $tmp{$link}++;
}
##
#adding Opera hack
##
while ( $bmcontent =~ /(URL\=)(\S+)/g ) {
$link = $2;
push @links, $link unless $tmp{$link}++;
}
#@links = qw ( www.linux.com www-3.ibm.com);
foreach my $this_link (@links) {
##skip self, causes a hang
if ( $this_link =~ /127\.0\.0\.1\:$port|locolhost\:$port/ ) { next }
##the page frabbing sub down below
##rip off the http:// IO doesn't use it
$this_link =~ s/http:\/\///g; ## take out the 'http://' out
###
#Check to see that the url isn't already indexed
###
if ( $old_urls{$this_link} ) { next }
my $content = page_grabber($this_link);
unless ($content) { next }
###
#Redirect check
###
if ( $content =~ /\015\012Location: http:\/\/(.*)\015\012/ ) {
my $redirect = $1;
$redirect =~ s/[\s]//g;
undef $content;
print "\nredirected to $redirect\n";
$content = page_grabber($redirect);
unless ($content) { next }
}
###Strip out the headers, HTML and non words
$content =~ m/(.+?)<\/title>/i;
my $title = $1;
unless ($title) { $title = '---' }
$content =~ m/description\"\s*content="(.+?)"/i;
my $desc = $1;
unless ($desc) { $desc = '---' }
$content =~ s/\n/ /g;
$content =~ s/\r/ /g;
$content =~ s/\t/ /g;
$content =~ s/.+/ /gs;
$content =~ s/[\W+]/ /gs;
$content =~ s/\s\d\s/ /gs;
$content =~ s/\&\w+\;/ /gs;
##removing of the stopwords
#foreach (@stop_words) { $content =~ s/\W$_\W/ /gi }
##splitting up of the content into words
my @words = split /\s+/, $content;
my $total_words = 0; # the total number of words in the content
my %words; # a hash of all the words found and the freq.
my %words_by_frequency;
foreach my $word (@words) {
##switch to lower case
$word =~ tr/A-Z/a-z/;
next if ( exists $stop_words{$word} );
##counts individual words
$words{$word}++;
##counts total words
$total_words++;
}
##stacks up the words into one big hash
foreach my $word ( keys %words ) {
my $count = $words{$word};
unless ( exists $words_by_frequency{$count} ) {
$words_by_frequency{$count} = [];
}
push @{ $words_by_frequency{$count} }, $word;
}
unless ($total_words) {
print $client "No words found in content\n";
}
my $list;
my $i = 1;
##sorts the hash by from largest to smallest
foreach my $count ( sort { $b <=> $a } ( keys %words_by_frequency ) ) {
##the words at this count
foreach my $word ( @{ $words_by_frequency{$count} } ) {
##percentage by devision
my $percentage = $words{$word} / $total_words * 100;
##format the percentage
$percentage =~ s/(\.\d\d).*/$1/;
##pick the ones to keep
##algo valume
if ( ( $percentage > 0.3 ) or ( $i < 100 ) ) {
$list .= "$word $percentage||";
$i++;
}
}
}
open( DB, ">>$database" );
flock DB, 2;
print DB "${this_link}\\\\\\${title}\|\|\|${desc}\/\/\/${list}\n";
flock DB, 8;
close(DB);
print " sliced and diced\n"
} ###this link end
print "all done\n";
print 'back to search index..';
###Done with the db build
} #end rebuild section
#####Parcer
sub Parse_Form {
my ( @pairs, @getpairs, $key, $value, $pair );
#print $query_string;
@getpairs = split ( /&/, $query_string );
push ( @pairs, @getpairs );
foreach $pair (@pairs) {
( $key, $value ) = split ( /=/, $pair );
$key =~ tr/+/ /;
$key =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
##big strip out to take care of possible exploits
$value =~ s/[\!\&;\`'\\\|"*?~<>^\(\)\[\]\{\}\$\n\r]//g;
##get rid of unneeded space
$value =~ s/^\s//g;
$value =~ s/\s+/ /g;
if ( $formdata{$key} ) {
$formdata{$key} .= ", $value";
}
else {
$formdata{$key} = $value;
}
}
}
####Grab webpage
sub page_grabber {
my ( $host, $document, $remote, $error );
##grab $url from parrent
my $url = shift (@_);
######
##split the domain from the page requested
######
if ( $url =~ /(.*?)\//i ) { ### catch everything up to the first /
##grab the domain for the call
$host = $1;
##grab the page request if it is there
$document = "/$'"; ## ok it's a host and document www.x.com/idx.htm
}
else {
##else just request the root of the domain
$host = $url; ## nope, it's just a host
$document = "/";
}
###
#seperate port if there
###
my $port;
if ( $host =~ /(.*?)\:(\d+)/ ) {
$host = $1;
$port = $2;
}
else { $port = 80 }
print "connecting to $host:$port ..";
$remote = IO::Socket::INET->new(
Proto => "tcp",
Timeout => "5",
PeerAddr => $host,
PeerPort => "http($port)",
);
my $content;
if ($remote) {
$remote->autoflush(1);
print ">>flushed>>";
print $remote "GET $document HTTP/1.1" . $EOL;
print $remote "Host: $host" . $EOL;
print $remote
'User-Agent: Mozilla/5.0 Galeon/1.2.6 (X11; Linux i686; U;) Gecko/20020913 Debian/1.2.6-2'
. $EOL;
print $remote
'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1'
. $EOL;
print $remote "Connection: close" . $EOL;
print $remote $EOL;
print $remote $EOL;
print " asked for $document..";
while (<$remote>) {
$content .= $_;
#print
}
close $remote;
print "closed connection";
}
else {
print " cannot connect\n";
}
return $content;
}