#!/usr/bin/perl -w
########################################
#Bookmark Crawler
#Copyright John Andrews 2002
#email questions to grumpyoldman@users.sourceforge.net
##########################################################################
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################
#How to use:
#Set the path/name to your bookmarks,
#Set the path/name to your database (flat text file), and
#Fire up the script
#Point your browser to http://127.0.0.1:6800/
#(an example query string is shown at the bottom of this file)
#
#...if you have a problem try changing the $port setting first
###############################################################
##
#***Linux/BSD/Mac version***Linux/BSD/Mac version***Linux/BSD/Mac version
##
##Settings below
##There should be only three paths to set.

##your bookmark file, works with Mozilla, Opera, Netscape, Galeon
my $bookmark_file = '/home/john/.galeon/bookmarks.xbel';

##path and name of the database to make.
my $database = "book.db";

##port to work off of
my $port = 6800;

##end settings

use strict;
use IO::Socket;

##check to see if the bookmark file exists
my $is_bk = "";
unless ( -e $bookmark_file ) {
    $is_bk = 'the bookmark file is missing at "' . $bookmark_file . '"';
}

###replaces "\r\n" -- more universal
my $EOL = "\015\012";    ## a CR LF pair for the server

my %formdata;
my ( $client_info, $query_string, @word );

###
#perl in a UNIX environment fork()s, in Windows it does a fork emulation;
#if this forking happens in Windows the script dies,
#if it doesn't in Linux the system sucks memory, so we don't kill it down below
sub Wait {
    wait;    #wait needed to keep pids from building up
}
$SIG{CHLD} = \&Wait;

my $server = IO::Socket::INET->new(
    LocalPort => $port,
    Type      => SOCK_STREAM,
    Reuse     => 1,
    Listen    => 10
  )
  or die "$@\n";

my $client;

while ( $client = $server->accept() ) {
    next if my $pid = fork;
    die "fork - $!\n" unless defined $pid;

    ##Grab the handoff from client if you want
    #
    while (<$client>) {
        if (m/GET\W*?\?(.+?)\s/i) { $query_string = $1; }
        last if /^\r\n$/;
        $client_info .= $_;
    }

    ##put conditional to keep -w happy
    if ($query_string) { &Parse_Form; }

    ##the three lines below are redundant, but -w nagged me when I did it the short way
    my $place_holder;
    if   ( $formdata{'search'} ) { $place_holder = "$formdata{'search'}" }
    else                         { $place_holder = "" }

    ###Hold client info for something useful
    ##$client_stuff;

    select $client;
    $| = 1;
    print $client "HTTP/1.0 200 OK\r\n";
    print $client "Connection: close\r\n";
    print $client "Content-type: text/html\r\n\r\n";

    ##print the front page
    ##(minimal markup below -- the original HTML was lost; the form field
    ## names match the %formdata keys used elsewhere in this script)
    print $client qq|<title>Bookmark Search</title>
$is_bk

<h2>Bookmark Search</h2>
<form method="GET" action="/">
<input type="text" name="search" size="40" value="$place_holder">
<input type="submit" value="Search">
<br>
<a href="/">new search</a>
<br>
<input type="checkbox" name="rebuild" value="1"> Add new bookmarks?
<input type="checkbox" name="wipe_out" value="1"> Rebuild Database?
<br>could take a very long time
</form>
|;

    ##may also seem redundant, but it is keeping -w happy
    my $last_search;
    if   ( $formdata{'ls'} ) { $last_search = $formdata{'ls'} }
    else                     { $last_search = 0 }

    if ( $formdata{'rebuild'} ) { &rebuild; }

    ##print the output of a search
    if ( $formdata{'search'} ) {
        my $search = $formdata{'search'};
        my @word = split ( / /, $search );
        print $client '  ';

        ##quick count of words searched for
        my $wc = 0;

        ##these lines are just to bold out the KWs being searched for
        ##(the <b> tags are a reconstruction of the lost markup)
        foreach (@word) {
            print $client " <b>$_</b> ";
            $wc++;
            if ( $wc && $wc < ( $#word + 1 ) ) { print $client "and"; }
        }
        print $client ' ';

        ##keep strict happy
        my ( @grabbed, $this_entry, $number, $list, %grabbed, $domain, %tmp,
            @search_nd_db );

        #open the database and put it into an array
        open( SEARCH, "$database" );
        my @search_db = <SEARCH>;
        close(SEARCH);

        ##take out the dupes
        foreach my $rip_dupes (@search_db) {
            push @search_nd_db, $rip_dupes unless $tmp{$rip_dupes}++;
        }

        ##run through each record
        foreach $this_entry (@search_nd_db) {

            ##create the bad match..this is here so that sites that match
            ##one word but not the rest do not get added
            my $bad_match = 0;

            ##keep strict happy, set up local variables
            my ( $this_url, $number, $info, $title, $this_desc );

            ##the key word search
            foreach my $this_word (@word) {

                ##is the word in there? or in the title..
                if ( $this_entry =~ m/\W$this_word\W|^$this_word\W/i ) {

                    ##extract title
                    if ( $this_entry =~ m/\\\\\\(.+?)\|\|\|/ ) { $title = $1 }
                    else { $title = "---" }

                    ##extract description
                    if ( $this_entry =~ m/\|\|\|(.+?)\/\/\// ) {
                        $this_desc = $1;
                    }
                    else { $this_desc = "---" }

                    ##extract url
                    if ( $this_entry =~ m/^(.+?)\\/ ) { $this_url = $1 }
                    else { $this_url = "---" }

                    ##get the kw weight
                    if ( $this_entry =~ m/\|$this_word\s(\d+\.\d+)/i ) {
                        $number += $1;
                    }

                    ##give a point for title
                    if ( $title =~ /$this_word/i ) { $number += 1 }

                    ##give half a point for description
                    if ( $this_desc =~ /$this_word/i ) { $number += 0.5 }
                    unless ($number) { $number = 0 }
                }

                ##kill this url if only partial match
                else { $bad_match = 1 }
            }

            ##more kill
            unless ($bad_match) {

                ##stack them
                $info = "$number\t$this_url\t$title\t$this_desc";
                push @grabbed, $info;
            }
        }

        my @sorted = sort { $b cmp $a } @grabbed;
        my $total  = $#sorted + 1;
        print $client "has $total results";
        print $client '<br>';

        my $count        = 1;
        my $last_printed = 0;

        ##print ten results per page, starting after the "ls" offset
        ##(the result markup below is a minimal reconstruction of lost HTML)
        foreach $list (@sorted) {
            my @url_info = split ( /\t/, $list );

            ##give a no score a '0.00'
            if ( $url_info[0] eq "" ) { $url_info[0] = "0.00" }
            if ( $last_search < $count && $count < ( $last_search + 11 ) ) {
                print $client '<b>' . $url_info[2] . '</b><br>' . "\n";
                print $client ' ' . "$count " . $url_info[3] . '<br>' . "\n";
                print $client 'http://' . $url_info[1]
                  . ' Weight =' . $url_info[0] . "<br><br>\n";
                $last_printed = $count;
            }
            $count++;
        }
        print $client '<br>';

        if ( $last_printed > 10 ) {

            ##set up the back button
            my $prev_count = ( $last_search - 10 );

            ##don't want it going past zero
            if ( $prev_count < 0 ) { $prev_count = 0 }

            ##(link markup reconstructed -- "ls" carries the paging offset)
            print $client '<a href="/?search=' . $formdata{'search'}
              . '&ls=' . $prev_count . '"><--Back</a> ';
        }
        if ( $last_printed < ( $#sorted + 1 ) ) {
            print $client '<a href="/?search=' . $formdata{'search'}
              . '&ls=' . $last_printed . '">Next--></a>';
        }
    }
    print $client '<br>';
    print $client qq| |;
    close($client);
    exit(fork);
}

continue {
    close($client);

    #kills hangs
    kill CHLD => -$$;
}

sub rebuild {

    ##wipe out old db if wipe out is selected, could use unlink here
    ##-- but I think it might be better not to, because of permission issues
    if ( $formdata{'wipe_out'} ) {
        open( DB, ">$database" );
        print DB "";
        close(DB);
    }

    ##build list of bookmarks already indexed
    open( DBPRE, "$database" );
    my @here_already = <DBPRE>;
    close(DBPRE);
    my @old_url;

    ##grab urls in $database
    my $i = 0;
    foreach (@here_already) {
        ( $old_url[ $i++ ] ) = split ( /\\\\\\/, $_ );
    }

    ##make a quick hash from urls already indexed
    my %old_urls;
    foreach my $urls (@old_url) {
        $old_urls{$urls} = 1;
    }
    print $client "\n";

    ###Stop words to be removed from the body
    my @stop_words = qw(
      at nbsp a an amp and are arial as be but by can com do font for from gif have
      he helvetica her his href i if in is it jpg net of on or org our sans ser she
      size that the this to us we with you your has such accordance suitable
      according having than all http herein also however their then another into there
      invention thereby therefore thereof its thereto because means these been
      not they being now those claim thus comprises onto corresponding use could
      other various described particularly was desired preferably were preferred what
      does present when each provide where embodiment provided whereby fig provides
      wherein figs relatively which respectively while said who further should will
      generally since had some would about above across adj after again against almost
      alone along although always am among any anybody anyone anything anywhere apart
      around aside away before behind below besides between beyond both cannot deep did
      doing done down downwards during either else enough etc even ever every everybody
      everyone except far few forth get gets got hardly here herself him himself how
      indeed instead inward itself just kept many maybe might mine more most mostly
      much must myself near neither next no nobody none nor nothing nowhere off often
      only others ought ours out outside over own p per please plus pp quite rather
      really s seem self selves several shall so somebody somewhat still theirs them themselves thorough
      thoroughly through together too toward towards under until up upon very well
      whatever whenever whether whom whose within without yet young yourself accordingly
      already apparently arise became briefly came certain certainly different due especially
      following found gave give given gives giving gone immediately keep largely like made
      mainly make mug nearly necessarily nos noted obtain obtained overall owing past
      perhaps possibly predominantly previously probably prompt promptly ready
      regardless same seen show showed shown shows similarly slightly sometime sometimes
      soon strongly substantially successfully though throughout unless used usefully
      using usually why widely www _
    );
    my %stop_words;

    ##Make a hash from the stop words
    foreach my $word (@stop_words) {
        $stop_words{$word} = 1;
    }
    open( DD, "$bookmark_file" );
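
    ##The two loops below pull URLs out of whatever the browser wrote to the
    ##bookmark file; illustrative lines they are meant to match (assumed
    ##formats, not taken from a real file):
    ##  XBEL/HTML style:   <bookmark href="http://www.example.com/page.html">
    ##  Opera .adr style:  URL=http://www.example.com/page.html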
    my @ww = <DD>;
    close(DD);
    my ( $bmcontent, $link, @links, %tmp, @lines );
    foreach (@ww) { $bmcontent .= $_ }

    ##pull every http url out of the bookmark file
    while ( $bmcontent =~ /(<\w*\s*\w*=.+)(http.*?)(\")/gi ) {
        $link = $2;
        $link =~ s/^"//;
        $link =~ s/".+//;
        push @links, $link unless $tmp{$link}++;
    }

    ##
    #adding Opera hack
    ##
    while ( $bmcontent =~ /(URL\=)(\S+)/g ) {
        $link = $2;
        push @links, $link unless $tmp{$link}++;
    }

    #@links = qw ( www.linux.com www-3.ibm.com);

    foreach my $this_link (@links) {

        ##skip self, causes a hang
        if ( $this_link =~ /127\.0\.0\.1\:$port|localhost\:$port/ ) { next }

        ##the page grabbing sub down below
        ##rip off the http:// -- IO::Socket doesn't use it
        $this_link =~ s/http:\/\///g;    ## take the 'http://' out

        ###
        #Check to see that the url isn't already indexed
        ###
        if ( $old_urls{$this_link} ) { next }

        my $content = page_grabber($this_link);
        unless ($content) { next }

        ###
        #Redirect check
        ###
        if ( $content =~ /\015\012Location: http:\/\/(.*)\015\012/ ) {
            my $redirect = $1;
            $redirect =~ s/[\s]//g;
            undef $content;
            print "\nredirected to $redirect\n";
            $content = page_grabber($redirect);
            unless ($content) { next }
        }

        ###Strip out the headers, HTML and non words
        my $title;
        if ( $content =~ m/<title>(.+?)<\/title>/i ) { $title = $1 }
        unless ($title) { $title = '---' }

        my $desc;
        if ( $content =~ m/description\"\s*content="(.+?)"/i ) { $desc = $1 }
        unless ($desc) { $desc = '---' }

        $content =~ s/\n/ /g;
        $content =~ s/\r/ /g;
        $content =~ s/\t/ /g;
        $content =~ s/.+<body//gis;
        $content =~ s/<\/body.+//si;
        $content =~ s/<(.*?)>/ /gs;
        $content =~ s/\W+/ /gs;
        $content =~ s/\s\d\s/ /gs;
        $content =~ s/\&\w+\;/ /gs;

        ##removing of the stopwords
        #foreach (@stop_words) { $content =~ s/\W$_\W/ /gi }

        ##splitting up of the content into words
        my @words = split /\s+/, $content;
        my $total_words = 0;    # the total number of words in the content
        my %words;              # a hash of all the words found and the freq.
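
        ##Worked example of the weighting done below (illustrative numbers
        ##only): if a word shows up 13 times on a 400-word page its weight is
        ##  13 / 400 * 100 = 3.25
        ##and it is kept because 3.25 > 0.3; less common words are still kept
        ##while fewer than 100 words have been collected for the page.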
        my %words_by_frequency;

        foreach my $word (@words) {

            ##switch to lower case
            $word =~ tr/A-Z/a-z/;
            next if ( exists $stop_words{$word} );

            ##counts individual words
            $words{$word}++;

            ##counts total words
            $total_words++;
        }

        ##stacks up the words into one big hash, keyed by frequency
        foreach my $word ( keys %words ) {
            my $count = $words{$word};
            unless ( exists $words_by_frequency{$count} ) {
                $words_by_frequency{$count} = [];
            }
            push @{ $words_by_frequency{$count} }, $word;
        }
        unless ($total_words) {
            print $client "No words found in content\n";
        }

        my $list = "";
        my $i    = 1;

        ##sorts the hash from largest to smallest
        foreach my $count ( sort { $b <=> $a } ( keys %words_by_frequency ) ) {

            ##the words at this count
            foreach my $word ( @{ $words_by_frequency{$count} } ) {

                ##percentage by division
                my $percentage = $words{$word} / $total_words * 100;

                ##format the percentage (truncate to two decimal places)
                $percentage =~ s/(\.\d\d).*/$1/;

                ##pick the ones to keep
                ##algo volume: everything over 0.3% plus the first 100 words
                if ( ( $percentage > 0.3 ) or ( $i < 100 ) ) {
                    $list .= "$word $percentage||";
                    $i++;
                }
            }
        }

        ##append the finished record to the flat-file database
        open( DB, ">>$database" );
        flock DB, 2;
        print DB "${this_link}\\\\\\${title}\|\|\|${desc}\/\/\/${list}\n";
        flock DB, 8;
        close(DB);
        print " sliced and diced\n";
    }    ###this link end

    print "all done\n";
    print '<a href="/">back to search index..</a>';

    ###Done with the db build
}    #end rebuild section

#####Parser
sub Parse_Form {
    my ( @pairs, @getpairs, $key, $value, $pair );

    #print $query_string;
    @getpairs = split ( /&/, $query_string );
    push ( @pairs, @getpairs );
    foreach $pair (@pairs) {
        ( $key, $value ) = split ( /=/, $pair );
        $key =~ tr/+/ /;
        $key =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;

        ##big strip out to take care of possible exploits
        $value =~ s/[\!\&;\`'\\\|"*?~<>^\(\)\[\]\{\}\$\n\r]//g;

        ##get rid of unneeded space
        $value =~ s/^\s+//;
        $value =~ s/\s+/ /g;
        if ( $formdata{$key} ) {
            $formdata{$key} .= ", $value";
        }
        else {
            $formdata{$key} = $value;
        }
    }
}

####Grab webpage
sub page_grabber {
    my ( $host, $document, $remote, $error );

    ##grab $url from parent
    my $url = shift (@_);

    ######
    ##split the domain from the page requested
    ######
    if ( $url =~ /(.*?)\//i ) {    ### catch everything up to the first /

        ##grab the domain for the call
        $host = $1;

        ##grab the page request if it is there
        $document = "/$'";    ## ok it's a host and document www.x.com/idx.htm
    }
    else {

        ##else just request the root of the domain
        $host     = $url;     ## nope, it's just a host
        $document = "/";
    }

    ###
    #separate out the port if it is there
    ###
    my $port;
    if ( $host =~ /(.*?)\:(\d+)/ ) {
        $host = $1;
        $port = $2;
    }
    else { $port = 80 }

    print "connecting to $host:$port ..";
    $remote = IO::Socket::INET->new(
        Proto    => "tcp",
        Timeout  => "5",
        PeerAddr => $host,
        PeerPort => "http($port)",
    );

    my $content;
    if ($remote) {
        $remote->autoflush(1);
        print ">>flushed>>";
        print $remote "GET $document HTTP/1.1" . $EOL;
        print $remote "Host: $host" . $EOL;
        print $remote 'User-Agent: Mozilla/5.0 Galeon/1.2.6 (X11; Linux i686; U;) Gecko/20020913 Debian/1.2.6-2' . $EOL;
        print $remote 'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1' . $EOL;
        print $remote "Connection: close" . $EOL;
        print $remote $EOL;
        print $remote $EOL;
        print " asked for $document..";

        while (<$remote>) {
            $content .= $_;

            #print
        }
        close $remote;
        print "closed connection";
    }
    else {
        print " cannot connect\n";
    }
    return $content;
}
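
##-----------------------------------------------------------------------
##Data format notes (illustrative examples only -- not real records).
##
##&rebuild appends one line per bookmark to $database, with the fields
##joined by \\\, ||| and /// and followed by "word weight||" pairs:
##
##  www.example.com/page.html\\\Example Title|||An example description///linux 3.25||perl 0.41||
##
##The search code above splits that same line back apart with matching
##regexes.
##
##Searches arrive as a plain GET query string that &Parse_Form loads into
##%formdata; for example (assuming the default port of 6800)
##
##  http://127.0.0.1:6800/?search=linux+kernel&ls=10
##
##searches for "linux kernel" and shows results 11 through 20 ("ls" is the
##paging offset).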