#!/usr/bin/perl -w
########################################
#Bookmark Crawler
#Copyright John Andrews 2002
#email questions to grumpyoldman@users.sourceforge.net
##########################################################################
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################
###
#***Windows version***Windows version***Windows version***Windows version
###
##Settings below
##There should be only three paths to set.
##your bookmark file, works with Mozilla, Opera, Netscape, Galeon
###
##--------------------------------------------------------------------
## Configuration -- the only three values a user should need to change.
##--------------------------------------------------------------------
use strict;
use IO::Socket;

## Bookmark file to index; works with Mozilla, Opera, Netscape, Galeon.
my $bookmark_file =
  'C:\WINDOWS\Application Data\Mozilla\Profiles\default\kg9p6pdg.slt\bookmarks.html';

## TCP port the built-in search server listens on.
my $port = 6800;

## Path and name of the keyword database this program builds.
my $database = 'C:\book.db';

##--------------------------------------------------------------------
## End of configuration.
##--------------------------------------------------------------------

## Front-page warning text, filled in when the bookmark file is absent.
my $is_bk = "";
unless ( -e $bookmark_file ) {
    $is_bk = 'the bookmark file missing at "' . $bookmark_file . '"' . "\n";
}

## A CR LF pair -- the line terminator HTTP requires (replaces "\r\n").
my $EOL = "\015\012";

## Parsed query-string parameters for the current request.
my %formdata;
my ( $client_info, $query_string, @word );

## The listening socket for the built-in single-threaded web server.
my $server = IO::Socket::INET->new(
    Listen    => 10,
    LocalPort => $port,
    Reuse     => 1,
    Type      => SOCK_STREAM,
) or die "$@\n";
## ------------------------------------------------------------------
## Main server loop: accept one client at a time (no fork -- this is
## the Windows version), read the request headers, serve the search /
## rebuild actions and close the connection.
## ------------------------------------------------------------------
my $client;
while ( $client = $server->accept() ) {
    ## Collect the request headers; the GET line carries the query
    ## string.  BUG FIX: reset $client_info per request so it cannot
    ## grow without bound across connections.
    $client_info = "";
    while (<$client>) {
        if (m/GET\W*?\?(.+?)\s/i) { $query_string = $1; }
        last if /^\r\n$/;    # a blank line ends the header block
        $client_info .= $_;
    }
    ##put conditional to keep -w happy
    if ($query_string) {
        &Parse_Form;
    }
    ## (The unused $palce_holder placeholder was removed.)
    ## Make the client the default output handle and unbuffer it.
    select $client;
    $| = 1;
    ## Minimal HTTP response headers.
    print $client "HTTP/1.0 200 OK\r\n";
    print $client "Connection: close\r\n";
    print $client "Content-type: text/html\r\n\r\n";
    ##print the front page
    print $client qq|
Bookmark Search
$is_bk
|;
    ## Offset of the last result already shown (for Back/Next paging);
    ## the explicit default keeps -w happy.
    my $last_search;
    if   ( $formdata{'ls'} ) { $last_search = $formdata{'ls'} }
    else                     { $last_search = 0 }
    ## Re-index the bookmarks when asked to.
    if ( $formdata{'rebuild'} ) {
        &rebuild;
    }
    ##print the output of a search
    if ( $formdata{'search'} ) {
        my $search = $formdata{'search'};
        my @word = split ( / /, $search );
        print $client ' ';
        ## Echo the keywords being searched for, joined with "and".
        my $wc = 0;
        foreach (@word) {
            print $client " $_ ";
            $wc++;
            if ( $wc && $wc < ( $#word + 1 ) ) { print $client "and"; }
        }
        print $client ' ';
        ##keep strict happy
        my (
            @grabbed, $this_entry, $number, $list,
            %grabbed, $domain, %tmp, @search_nd_db
        );
        ## Read the database into memory.  BUG FIX: the read from the
        ## handle had been lost ("my @search_db = ;"), which left the
        ## database permanently empty; also use a checked 3-arg open.
        my @search_db;
        if ( open( my $search_fh, '<', $database ) ) {
            @search_db = <$search_fh>;
            close($search_fh);
        }
        ##take out the dupes
        foreach my $rip_dupes (@search_db) {
            push @search_nd_db, $rip_dupes unless $tmp{$rip_dupes}++;
        }
        ##run through each record
        foreach $this_entry (@search_nd_db) {
            ## A record only counts when EVERY search word matches;
            ## $bad_match kills sites that match one word but not the rest.
            my $bad_match = 0;
            ##keep strict happy, set up local variables
            my ( $this_url, $number, $info, $title, $this_desk );
            ##the key word search
            foreach my $this_word (@word) {
                ##is the word in there? or in the title..
                if ( $this_entry =~ m/\W$this_word\W|^$this_word\W/i ) {
                    ##extract title (stored between \\\ and |||)
                    if ( $this_entry =~ m/\\\\\\(.+?)\|\|\|/ ) { $title = $1 }
                    else { $title = "---" }
                    ##extract description (stored between ||| and ///)
                    if ( $this_entry =~ m/\|\|\|(.+?)\/\/\// ) {
                        $this_desk = $1;
                    }
                    else { $this_desk = "---" }
                    ##extract url (everything up to the first backslash)
                    if ( $this_entry =~ m/^(.+?)\\/ ) { $this_url = $1 }
                    else { $this_entry = "---" }
                    ##add the stored keyword weight
                    if ( $this_entry =~ m/\|$this_word\t(\d+\.\d+)/i ) {
                        $number += $1;
                    }
                    ##give a point for a title hit
                    if ( $title =~ /$this_word/i ) { $number += 1 }
                    ##give half a point for a description hit
                    if ( $this_desk =~ /$this_word/i ) { $number += 0.5 }
                    unless ($number) { $number = 0 }
                }
                ##kill this url if only partial match
                else { $bad_match = 1 }
            }
            ##more kill
            unless ($bad_match) {
                ##stack them: "weight \t url \t title \t description"
                $info = "$number\t$this_url\t$title\t$this_desk";
                push @grabbed, $info;
            }
        }
        ## Sort by weight, highest first.  BUG FIX: the old string sort
        ## ($b cmp $a) misordered numeric weights, e.g. 9.00 above 10.00;
        ## compare the leading weight field numerically instead.
        my @sorted = map { $_->[1] }
          sort { $b->[0] <=> $a->[0] }
          map { [ ( split( /\t/, $_ ) )[0] || 0, $_ ] } @grabbed;
        my $total = $#sorted + 1;
        print $client "has $total results";
        print $client '';
        ## Print only the ten results following $last_search.
        my $count        = 1;
        my $last_printed = 0;
        foreach $list (@sorted) {
            my @url_info = split ( /\t/, $list );
            ##give a no score a 'o.oo'
            if ( $url_info[0] eq "" ) { $url_info[0] = "0.00" }
            if ( $last_search < $count && $count < ( $last_search + 11 ) ) {
                print $client ''
                  . $url_info[2]
                  . '
' . "\n";
                print $client ' '
                  . "$count \ "
                  . $url_info[3]
                  . '' . "\n";
                print $client 'http://'
                  . $url_info[1]
                  . ' Weight ='
                  . $url_info[0]
                  . "
\n";
                $last_printed = $count;
            }
            $count++;
        }
        print $client '
';
        if ( $last_printed > 10 ) {
            ##set up the back button; don't let the offset go below zero
            my $prev_count = ( $last_search - 10 );
            if ( $prev_count < 0 ) { $prev_count = 0 }
            print $client '<--Back ';
        }
        if ( $last_printed < ( $#sorted + 1 ) ) {
            print $client 'Next-->';
        }
    }
    print $client '';
    print $client qq| |;
    ##added for windows because of the fork issue
    close($client);
}
continue {
    close($client);    #kills hangs
    $query_string = 0;
    undef %formdata;
}
## Rebuild the keyword database from the bookmark file: pull every URL
## out of the bookmarks, fetch each page, strip it down to words, score
## word frequencies and append one record per URL to $database.
## Reads the file-level $formdata/$database/$bookmark_file/$client.
sub rebuild {
    ##wipe out old db if wipe out is selected, could use unlink here
    ##-- but truncating instead avoids permission issues
    if ( $formdata{'wipe_out'} ) {
        if ( open( my $wipe_fh, '>', $database ) ) {
            print {$wipe_fh} "";
            close($wipe_fh);
        }
    }
    ## Build the list of bookmarks already indexed.  BUG FIX: the read
    ## from the handle had been lost ("my @here_already = ;"), so every
    ## URL was re-fetched on each rebuild.
    my @here_already;
    if ( open( my $pre_fh, '<', $database ) ) {
        @here_already = <$pre_fh>;
        close($pre_fh);
    }
    my @old_url;
    ##grab urls in $database (the url is everything before the \\\ marker)
    my $i = 0;    # BUG FIX: start at 0, not undef (silences -w)
    foreach (@here_already) {
        ( $old_url[ $i++ ] ) = split ( /\\\\\\/, $_ );
    }
    ##make a quick hash from urls already indexed
    my %old_urls;
    foreach my $urls (@old_url) {
        $old_urls{$urls} = 1;
    }
    print $client "";
    ###Stop words to be removed from the body
    my @stop_words = qw(
      at nbsp a an amp and are arial as be but by can com do font for from gif have
      he helvetica her his href i if in is it jpg net of on or org our sans ser she
      size that the this to us we with you your has such accordance suitable
      according having than all http herein also however their then another into there
      invention thereby therefore thereof its thereto because means these been
      not they being now those claim thus comprises onto corresponding use could
      other various described particularly was desired preferably were preferred what
      does present when each provide where embodiment provided whereby fig provides
      wherein figs relatively which respectively while said who further should will
      generally since had some would about above across adj after again against almost
      alone along although always am among any anybody anyone anything anywhere apart
      around aside away before behind below besides between beyond both cannot deep did
      doing done down downwards during either else enough etc even ever every everybody
      everyone except far few forth get gets got hardly here herself him himself how
      indeed instead inward itself just kept many maybe might mine more most mostly
      much must myself near neither next no nobody none nor nothing nowhere off often
      only others ought ours out outside over own p per please plus pp quite rather
      really s seem self selves several shall so somebody somewhat still theirs them themselves thorough
      thoroughly through together too toward towards under until up upon very well
      whatever whenever whether whom whose within without yet young yourself accordingly
      already apparently arise became briefly came certain certainly different due especially
      following found gave give given gives giving gone immediately keep largely like made
      mainly make mug nearly necessarily nos noted obtain obtained overall owing past
      perhaps please possibly predominantly previously probably prompt promptly ready
      regardless same seen show showed shown shows similarly slightly sometime sometimes
      soon strongly substantially successfully though throughout unless used usefully
      using usually why widely www _
    );
    my %stop_words;
    ##Make a hash from the stop words
    foreach my $word (@stop_words) {
        $stop_words{$word} = 1;
    }
    ## Slurp the bookmark file.  BUG FIX: the read from the handle had
    ## been lost ("my @ww = ;"), leaving the bookmark content empty.
    my @ww;
    if ( open( my $bm_fh, '<', $bookmark_file ) ) {
        @ww = <$bm_fh>;
        close($bm_fh);
    }
    my ( $bmcontent, $link, @links, %tmp );
    foreach (@ww) { $bmcontent .= $_ }
    ## Pull every http URL out of href attributes (Mozilla / Netscape /
    ## Galeon bookmark files are HTML).
    while ( $bmcontent =~ /(<\w*\s*\w*=.+)(http.*?)(\")/gi ) {
        $link = $2;
        $link =~ s/^"//;
        $link =~ s/".+//;
        push @links, $link unless $tmp{$link}++;
    }
    ##
    #adding Opera hack -- Opera bookmark files use "URL=..." lines
    ##
    while ( $bmcontent =~ /(URL\=)(\S+)/g ) {
        $link = $2;
        push @links, $link unless $tmp{$link}++;
    }
    #@links = qw ( www.linux.com www-3.ibm.com);
    foreach my $this_link (@links) {
        ## Skip ourselves -- fetching our own port would hang the server.
        ## BUG FIX: the "locolhost" typo meant localhost bookmarks were
        ## fetched anyway.
        if ( $this_link =~ /127\.0\.0\.1\:$port|localhost\:$port/ ) { next }
        ##rip off the http:// -- IO::Socket doesn't use it
        $this_link =~ s/http:\/\///g;
        ###
        #Check to see that the url isn't already indexed
        ###
        if ( $old_urls{$this_link} ) { next }
        my $content = page_grabber($this_link);
        unless ($content) { next }
        ###
        #Follow a single HTTP redirect if a Location header came back
        ###
        if ( $content =~ /\015\012Location: http:\/\/(.*)\015\012/ ) {
            my $redirect = $1;
            $redirect =~ s/[\s]//g;
            undef $content;
            print "\nredirected to $redirect\n";
            $content = page_grabber($redirect);
            unless ($content) { next }
        }
        ###Strip out the headers, HTML and non words
        ## BUG FIX: anchor on <title> so the whole header block is not
        ## captured, and never read $1 from a failed match (stale $1).
        my $title = '---';
        if ( $content =~ m/<title>(.+?)<\/title>/is ) { $title = $1 }
        my $desc = '---';
        if ( $content =~ m/description\"\s*content="(.+?)"/i ) { $desc = $1 }
        $content =~ s/\n/ /g;
        $content =~ s/\r/ /g;
        $content =~ s/\t/ /g;
        ## BUG FIX: strip HTML tags; the previous pattern (s/.+/ /gs)
        ## replaced the ENTIRE page with a single space.
        $content =~ s/<[^>]*>/ /gs;
        $content =~ s/[\W+]/ /gs;
        $content =~ s/\s\d\s/ /gs;
        $content =~ s/\&\w+\;/ /gs;
        ##splitting up of the content into words
        my @words       = split /\s+/, $content;
        my $total_words = 0;       # total number of words kept
        my %words;                 # word => occurrence count
        my %words_by_frequency;    # count => [words at that count]
        foreach my $word (@words) {
            ##switch to lower case
            $word =~ tr/A-Z/a-z/;
            next if ( exists $stop_words{$word} );
            ##counts individual words
            $words{$word}++;
            ##counts total words
            $total_words++;
        }
        ##invert the count hash: frequency => list of words
        foreach my $word ( keys %words ) {
            my $count = $words{$word};
            unless ( exists $words_by_frequency{$count} ) {
                $words_by_frequency{$count} = [];
            }
            push @{ $words_by_frequency{$count} }, $word;
        }
        unless ($total_words) {
            print $client "No words found in content\n";
        }
        ## BUG FIX: start $list defined so the record print below
        ## cannot warn on an empty page.
        my $list = "";
        my $i    = 1;
        ##walk the frequencies from largest to smallest
        foreach my $count ( sort { $b <=> $a } ( keys %words_by_frequency ) ) {
            ##the words at this count
            foreach my $word ( @{ $words_by_frequency{$count} } ) {
                ##keyword weight as a percentage of the page
                my $percentage = $words{$word} / $total_words * 100;
                ##trim the percentage to two decimals
                $percentage =~ s/(\.\d\d).*/$1/;
                ##keep heavyweight words, or at least the first hundred
                if ( ( $percentage > 0.3 ) or ( $i < 100 ) ) {
                    $list .= "$word $percentage||";
                    $i++;
                }
            }
        }
        ## Append this record: url\\\title|||desc///word-weight list.
        if ( open( my $db_fh, '>>', $database ) ) {
            flock $db_fh, 2;    # 2 == LOCK_EX
            print {$db_fh} "${this_link}\\\\\\${title}\|\|\|$desc\/\/\/${list}\n";
            flock $db_fh, 8;    # 8 == LOCK_UN
            close($db_fh);
        }
        print " sliced and diced\n"
    }    ###this link end
    print "all done\n";
    print 'back to search index..';
    ###Done with the db build
}    #end rebuild section
#####Parser
## Split the GET query string (file-level $query_string) into the
## file-level %formdata hash.  Keys and values are URL-decoded; values
## are scrubbed of shell/HTML metacharacters and extra whitespace, and
## repeated keys accumulate as a comma-separated list.
sub Parse_Form {
    my ( @pairs, @getpairs, $key, $value, $pair );
    #print $query_string;
    @getpairs = split ( /&/, $query_string );
    push ( @pairs, @getpairs );
    foreach $pair (@pairs) {
        ## BUG FIX: limit the split to two fields so values that
        ## themselves contain '=' are not truncated.
        ( $key, $value ) = split ( /=/, $pair, 2 );
        ## A bare key (e.g. "?rebuild") has no value; default it so the
        ## substitutions below do not warn on undef.
        $value = '' unless defined $value;
        $key =~ tr/+/ /;
        $key =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        ## Strip characters that could be abused in a shell/HTML context.
        $value =~ s/[\!\&;\`'\\\|"*?~<>^\(\)\[\]\{\}\$\n\r]//g;
        ##get rid of unneeded space
        $value =~ s/^\s//g;
        $value =~ s/\s+/ /g;
        if ( $formdata{$key} ) {
            $formdata{$key} .= ", $value";
        }
        else {
            $formdata{$key} = $value;
        }
    }
}
####Grab webpage
## Fetch one page over plain HTTP/1.1 and return the raw response
## (headers included), or undef when the connection fails.  The single
## argument is "host[:port][/document]" with no leading "http://".
sub page_grabber {
    my ( $host, $document, $remote, $error );
    ##grab $url from parent
    my $url = shift (@_);
    ######
    ##split the domain from the page requested
    ######
    ## BUG FIX: capture the document instead of using $' (the postmatch
    ## variable slows every regex in the whole program).
    if ( $url =~ m{^(.*?)/(.*)$}s ) {    ### everything up to the first /
        $host     = $1;                  ## the domain for the call
        $document = "/$2";    ## host and document, e.g. www.x.com/idx.htm
    }
    else {
        ##else just request the root of the domain
        $host     = $url;     ## nope, it's just a host
        $document = "/";
    }
    ###
    #separate out an explicit port if there, default to 80
    ###
    my $port;
    if ( $host =~ /(.*?)\:(\d+)/ ) {
        $host = $1;
        $port = $2;
    }
    else { $port = 80 }
    print "connecting to $host:$port ..";
    ## BUG FIX: PeerPort was "http($port)", which looks up the "http"
    ## service (port 80) FIRST and only falls back to $port on lookup
    ## failure -- so explicit ports in bookmark URLs were ignored.
    $remote = IO::Socket::INET->new(
        Proto    => "tcp",
        Timeout  => "5",
        PeerAddr => $host,
        PeerPort => $port,
    );
    my $content;
    if ($remote) {
        $remote->autoflush(1);
        print ">>flushed>>";
        print $remote "GET $document HTTP/1.1" . $EOL;
        print $remote "Host: $host" . $EOL;
        print $remote
          'User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.0.1) Gecko/20020823 Netscape/7.0'
          . $EOL;
        print $remote
          'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1'
          . $EOL;
        print $remote "Connection: close" . $EOL;
        ## a blank line ends the request headers
        print $remote $EOL;
        print $remote $EOL;
        print " asked for $document..";
        ## slurp the whole response, headers and all
        while (<$remote>) {
            $content .= $_;
        }
        close $remote;
        print "closed connection";
    }
    else {
        print " cannot connect\n";
    }
    return $content;
}