#!/usr/bin/perl -w
########################################
#Bookmark Crawler
#Copyright John Andrews 2002
#email questions to grumpyoldman@users.sourceforge.net
##########################################################################
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################
###
#***Windows version***Windows version***Windows version***Windows version
###
##Settings below
##There should be only three paths to set.
##your bookmark file, works with Mozilla, Opera, Netscape, Galeon
###
##--------------------------------------------------------------------
## Configuration -- the only three values a user should need to change.
##--------------------------------------------------------------------
use strict;
use IO::Socket;

## Bookmark file to index; works with Mozilla, Opera, Netscape, Galeon.
my $bookmark_file =
  'C:\WINDOWS\Application Data\Mozilla\Profiles\default\kg9p6pdg.slt\bookmarks.html';

## TCP port the built-in search server listens on.
my $port = 6800;

## Path and name of the keyword database this program builds.
my $database = 'C:\book.db';

##--------------------------------------------------------------------
## End of configuration.
##--------------------------------------------------------------------

## Front-page warning text, filled in when the bookmark file is absent.
my $is_bk = "";
unless ( -e $bookmark_file ) {
    $is_bk = 'the bookmark file missing at "' . $bookmark_file . '"' . "\n";
}

## A CR LF pair -- the line terminator HTTP requires (replaces "\r\n").
my $EOL = "\015\012";

## Parsed query-string parameters for the current request.
my %formdata;
my ( $client_info, $query_string, @word );

## The listening socket for the built-in single-threaded web server.
my $server = IO::Socket::INET->new(
    Listen    => 10,
    LocalPort => $port,
    Reuse     => 1,
    Type      => SOCK_STREAM,
) or die "$@\n";
## ------------------------------------------------------------------
## Main server loop: accept one client at a time (no fork -- this is
## the Windows version), read the request headers, serve the search /
## rebuild actions and close the connection.
## ------------------------------------------------------------------
my $client;
while ( $client = $server->accept() ) {
    ## Collect the request headers; the GET line carries the query
    ## string.  BUG FIX: reset $client_info per request so it cannot
    ## grow without bound across connections.
    $client_info = "";
    while (<$client>) {
        if (m/GET\W*?\?(.+?)\s/i) { $query_string = $1; }
        last if /^\r\n$/;    # a blank line ends the header block
        $client_info .= $_;
    }
    ##put conditional to keep -w happy
    if ($query_string) {
        &Parse_Form;
    }
    ## (The unused $palce_holder placeholder was removed.)
    ## Make the client the default output handle and unbuffer it.
    select $client;
    $| = 1;
    ## Minimal HTTP response headers.
    print $client "HTTP/1.0 200 OK\r\n";
    print $client "Connection: close\r\n";
    print $client "Content-type: text/html\r\n\r\n";
    ##print the front page
    print $client qq|
Bookmark Search
$is_bk
|;
    ## Offset of the last result already shown (for Back/Next paging);
    ## the explicit default keeps -w happy.
    my $last_search;
    if   ( $formdata{'ls'} ) { $last_search = $formdata{'ls'} }
    else                     { $last_search = 0 }
    ## Re-index the bookmarks when asked to.
    if ( $formdata{'rebuild'} ) {
        &rebuild;
    }
    ##print the output of a search
    if ( $formdata{'search'} ) {
        my $search = $formdata{'search'};
        my @word = split ( / /, $search );
        print $client ' ';
        ## Echo the keywords being searched for, joined with "and".
        my $wc = 0;
        foreach (@word) {
            print $client " $_ ";
            $wc++;
            if ( $wc && $wc < ( $#word + 1 ) ) { print $client "and"; }
        }
        print $client ' ';
        ##keep strict happy
        my (
            @grabbed, $this_entry, $number, $list,
            %grabbed, $domain, %tmp, @search_nd_db
        );
        ## Read the database into memory.  BUG FIX: the read from the
        ## handle had been lost ("my @search_db = ;"), which left the
        ## database permanently empty; also use a checked 3-arg open.
        my @search_db;
        if ( open( my $search_fh, '<', $database ) ) {
            @search_db = <$search_fh>;
            close($search_fh);
        }
        ##take out the dupes
        foreach my $rip_dupes (@search_db) {
            push @search_nd_db, $rip_dupes unless $tmp{$rip_dupes}++;
        }
        ##run through each record
        foreach $this_entry (@search_nd_db) {
            ## A record only counts when EVERY search word matches;
            ## $bad_match kills sites that match one word but not the rest.
            my $bad_match = 0;
            ##keep strict happy, set up local variables
            my ( $this_url, $number, $info, $title, $this_desk );
            ##the key word search
            foreach my $this_word (@word) {
                ##is the word in there? or in the title..
                if ( $this_entry =~ m/\W$this_word\W|^$this_word\W/i ) {
                    ##extract title (stored between \\\ and |||)
                    if ( $this_entry =~ m/\\\\\\(.+?)\|\|\|/ ) { $title = $1 }
                    else { $title = "---" }
                    ##extract description (stored between ||| and ///)
                    if ( $this_entry =~ m/\|\|\|(.+?)\/\/\// ) {
                        $this_desk = $1;
                    }
                    else { $this_desk = "---" }
                    ##extract url (everything up to the first backslash)
                    if ( $this_entry =~ m/^(.+?)\\/ ) { $this_url = $1 }
                    else { $this_entry = "---" }
                    ##add the stored keyword weight
                    if ( $this_entry =~ m/\|$this_word\t(\d+\.\d+)/i ) {
                        $number += $1;
                    }
                    ##give a point for a title hit
                    if ( $title =~ /$this_word/i ) { $number += 1 }
                    ##give half a point for a description hit
                    if ( $this_desk =~ /$this_word/i ) { $number += 0.5 }
                    unless ($number) { $number = 0 }
                }
                ##kill this url if only partial match
                else { $bad_match = 1 }
            }
            ##more kill
            unless ($bad_match) {
                ##stack them: "weight \t url \t title \t description"
                $info = "$number\t$this_url\t$title\t$this_desk";
                push @grabbed, $info;
            }
        }
        ## Sort by weight, highest first.  BUG FIX: the old string sort
        ## ($b cmp $a) misordered numeric weights, e.g. 9.00 above 10.00;
        ## compare the leading weight field numerically instead.
        my @sorted = map { $_->[1] }
          sort { $b->[0] <=> $a->[0] }
          map { [ ( split( /\t/, $_ ) )[0] || 0, $_ ] } @grabbed;
        my $total = $#sorted + 1;
        print $client "has $total results";
        print $client '';
        ## Print only the ten results following $last_search.
        my $count        = 1;
        my $last_printed = 0;
        foreach $list (@sorted) {
            my @url_info = split ( /\t/, $list );
            ##give a no score a 'o.oo'
            if ( $url_info[0] eq "" ) { $url_info[0] = "0.00" }
            if ( $last_search < $count && $count < ( $last_search + 11 ) ) {
                print $client ''
                  . $url_info[2]
                  . '
' . "\n";
                print $client ' '
                  . "$count \ "
                  . $url_info[3]
                  . '' . "\n";
                print $client 'http://'
                  . $url_info[1]
                  . ' Weight ='
                  . $url_info[0]
                  . "
\n";
                $last_printed = $count;
            }
            $count++;
        }
        print $client '
';
        if ( $last_printed > 10 ) {
            ##set up the back button; don't let the offset go below zero
            my $prev_count = ( $last_search - 10 );
            if ( $prev_count < 0 ) { $prev_count = 0 }
            print $client '<--Back ';
        }
        if ( $last_printed < ( $#sorted + 1 ) ) {
            print $client 'Next-->';
        }
    }
    print $client '';
    print $client qq| |;
    ##added for windows because of the fork issue
    close($client);
}
continue {
    close($client);    #kills hangs
    $query_string = 0;
    undef %formdata;
}
## Rebuild the keyword database from the bookmark file: pull every URL
## out of the bookmarks, fetch each page, strip it down to words, score
## word frequencies and append one record per URL to $database.
## Reads the file-level $formdata/$database/$bookmark_file/$client.
sub rebuild {
    ##wipe out old db if wipe out is selected, could use unlink here
    ##-- but truncating instead avoids permission issues
    if ( $formdata{'wipe_out'} ) {
        if ( open( my $wipe_fh, '>', $database ) ) {
            print {$wipe_fh} "";
            close($wipe_fh);
        }
    }
    ## Build the list of bookmarks already indexed.  BUG FIX: the read
    ## from the handle had been lost ("my @here_already = ;"), so every
    ## URL was re-fetched on each rebuild.
    my @here_already;
    if ( open( my $pre_fh, '<', $database ) ) {
        @here_already = <$pre_fh>;
        close($pre_fh);
    }
    my @old_url;
    ##grab urls in $database (the url is everything before the \\\ marker)
    my $i = 0;    # BUG FIX: start at 0, not undef (silences -w)
    foreach (@here_already) {
        ( $old_url[ $i++ ] ) = split ( /\\\\\\/, $_ );
    }
    ##make a quick hash from urls already indexed
    my %old_urls;
    foreach my $urls (@old_url) {
        $old_urls{$urls} = 1;
    }
    print $client "";
    ###Stop words to be removed from the body
    my @stop_words = qw(
      at nbsp a an amp and are arial as be but by can com do font for from gif have
      he helvetica her his href i if in is it jpg net of on or org our sans ser she
      size that the this to us we with you your has such accordance suitable
      according having than all http herein also however their then another into there
      invention thereby therefore thereof its thereto because means these been
      not they being now those claim thus comprises onto corresponding use could
      other various described particularly was desired preferably were preferred what
      does present when each provide where embodiment provided whereby fig provides
      wherein figs relatively which respectively while said who further should will
      generally since had some would about above across adj after again against almost
      alone along although always am among any anybody anyone anything anywhere apart
      around aside away before behind below besides between beyond both cannot deep did
      doing done down downwards during either else enough etc even ever every everybody
      everyone except far few forth get gets got hardly here herself him himself how
      indeed instead inward itself just kept many maybe might mine more most mostly
      much must myself near neither next no nobody none nor nothing nowhere off often
      only others ought ours out outside over own p per please plus pp quite rather
      really s seem self selves several shall so somebody somewhat still theirs them themselves thorough
      thoroughly through together too toward towards under until up upon very well
      whatever whenever whether whom whose within without yet young yourself accordingly
      already apparently arise became briefly came certain certainly different due especially
      following found gave give given gives giving gone immediately keep largely like made
      mainly make mug nearly necessarily nos noted obtain obtained overall owing past
      perhaps please possibly predominantly previously probably prompt promptly ready
      regardless same seen show showed shown shows similarly slightly sometime sometimes
      soon strongly substantially successfully though throughout unless used usefully
      using usually why widely www _
    );
    my %stop_words;
    ##Make a hash from the stop words
    foreach my $word (@stop_words) {
        $stop_words{$word} = 1;
    }
    ## Slurp the bookmark file.  BUG FIX: the read from the handle had
    ## been lost ("my @ww = ;"), leaving the bookmark content empty.
    my @ww;
    if ( open( my $bm_fh, '<', $bookmark_file ) ) {
        @ww = <$bm_fh>;
        close($bm_fh);
    }
    my ( $bmcontent, $link, @links, %tmp );
    foreach (@ww) { $bmcontent .= $_ }
    ## Pull every http URL out of href attributes (Mozilla / Netscape /
    ## Galeon bookmark files are HTML).
    while ( $bmcontent =~ /(<\w*\s*\w*=.+)(http.*?)(\")/gi ) {
        $link = $2;
        $link =~ s/^"//;
        $link =~ s/".+//;
        push @links, $link unless $tmp{$link}++;
    }
    ##
    #adding Opera hack -- Opera bookmark files use "URL=..." lines
    ##
    while ( $bmcontent =~ /(URL\=)(\S+)/g ) {
        $link = $2;
        push @links, $link unless $tmp{$link}++;
    }
    #@links = qw ( www.linux.com www-3.ibm.com);
    foreach my $this_link (@links) {
        ## Skip ourselves -- fetching our own port would hang the server.
        ## BUG FIX: the "locolhost" typo meant localhost bookmarks were
        ## fetched anyway.
        if ( $this_link =~ /127\.0\.0\.1\:$port|localhost\:$port/ ) { next }
        ##rip off the http:// -- IO::Socket doesn't use it
        $this_link =~ s/http:\/\///g;
        ###
        #Check to see that the url isn't already indexed
        ###
        if ( $old_urls{$this_link} ) { next }
        my $content = page_grabber($this_link);
        unless ($content) { next }
        ###
        #Follow a single HTTP redirect if a Location header came back
        ###
        if ( $content =~ /\015\012Location: http:\/\/(.*)\015\012/ ) {
            my $redirect = $1;
            $redirect =~ s/[\s]//g;
            undef $content;
            print "\nredirected to $redirect\n";
            $content = page_grabber($redirect);
            unless ($content) { next }
        }
        ###Strip out the headers, HTML and non words
        ## BUG FIX: anchor on <title> so the whole header block is not
        ## captured, and never read $1 from a failed match (stale $1).
        my $title = '---';
        if ( $content =~ m/<title>(.+?)<\/title>/is ) { $title = $1 }
        my $desc = '---';
        if ( $content =~ m/description\"\s*content="(.+?)"/i ) { $desc = $1 }
        $content =~ s/\n/ /g;
        $content =~ s/\r/ /g;
        $content =~ s/\t/ /g;
        ## BUG FIX: strip HTML tags; the previous pattern (s/.+/ /gs)
        ## replaced the ENTIRE page with a single space.
        $content =~ s/<[^>]*>/ /gs;
        $content =~ s/[\W+]/ /gs;
        $content =~ s/\s\d\s/ /gs;
        $content =~ s/\&\w+\;/ /gs;
        ##splitting up of the content into words
        my @words       = split /\s+/, $content;
        my $total_words = 0;       # total number of words kept
        my %words;                 # word => occurrence count
        my %words_by_frequency;    # count => [words at that count]
        foreach my $word (@words) {
            ##switch to lower case
            $word =~ tr/A-Z/a-z/;
            next if ( exists $stop_words{$word} );
            ##counts individual words
            $words{$word}++;
            ##counts total words
            $total_words++;
        }
        ##invert the count hash: frequency => list of words
        foreach my $word ( keys %words ) {
            my $count = $words{$word};
            unless ( exists $words_by_frequency{$count} ) {
                $words_by_frequency{$count} = [];
            }
            push @{ $words_by_frequency{$count} }, $word;
        }
        unless ($total_words) {
            print $client "No words found in content\n";
        }
        ## BUG FIX: start $list defined so the record print below
        ## cannot warn on an empty page.
        my $list = "";
        my $i    = 1;
        ##walk the frequencies from largest to smallest
        foreach my $count ( sort { $b <=> $a } ( keys %words_by_frequency ) ) {
            ##the words at this count
            foreach my $word ( @{ $words_by_frequency{$count} } ) {
                ##keyword weight as a percentage of the page
                my $percentage = $words{$word} / $total_words * 100;
                ##trim the percentage to two decimals
                $percentage =~ s/(\.\d\d).*/$1/;
                ##keep heavyweight words, or at least the first hundred
                if ( ( $percentage > 0.3 ) or ( $i < 100 ) ) {
                    $list .= "$word $percentage||";
                    $i++;
                }
            }
        }
        ## Append this record: url\\\title|||desc///word-weight list.
        if ( open( my $db_fh, '>>', $database ) ) {
            flock $db_fh, 2;    # 2 == LOCK_EX
            print {$db_fh} "${this_link}\\\\\\${title}\|\|\|$desc\/\/\/${list}\n";
            flock $db_fh, 8;    # 8 == LOCK_UN
            close($db_fh);
        }
        print " sliced and diced\n"
    }    ###this link end
    print "all done\n";
    print 'back to search index..';
    ###Done with the db build
}    #end rebuild section
#####Parser
## Split the GET query string (file-level $query_string) into the
## file-level %formdata hash.  Keys and values are URL-decoded; values
## are scrubbed of shell/HTML metacharacters and extra whitespace, and
## repeated keys accumulate as a comma-separated list.
sub Parse_Form {
    my ( @pairs, @getpairs, $key, $value, $pair );
    #print $query_string;
    @getpairs = split ( /&/, $query_string );
    push ( @pairs, @getpairs );
    foreach $pair (@pairs) {
        ## BUG FIX: limit the split to two fields so values that
        ## themselves contain '=' are not truncated.
        ( $key, $value ) = split ( /=/, $pair, 2 );
        ## A bare key (e.g. "?rebuild") has no value; default it so the
        ## substitutions below do not warn on undef.
        $value = '' unless defined $value;
        $key =~ tr/+/ /;
        $key =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        ## Strip characters that could be abused in a shell/HTML context.
        $value =~ s/[\!\&;\`'\\\|"*?~<>^\(\)\[\]\{\}\$\n\r]//g;
        ##get rid of unneeded space
        $value =~ s/^\s//g;
        $value =~ s/\s+/ /g;
        if ( $formdata{$key} ) {
            $formdata{$key} .= ", $value";
        }
        else {
            $formdata{$key} = $value;
        }
    }
}
####Grab webpage
## Fetch one page over plain HTTP/1.1 and return the raw response
## (headers included), or undef when the connection fails.  The single
## argument is "host[:port][/document]" with no leading "http://".
sub page_grabber {
    my ( $host, $document, $remote, $error );
    ##grab $url from parent
    my $url = shift (@_);
    ######
    ##split the domain from the page requested
    ######
    ## BUG FIX: capture the document instead of using $' (the postmatch
    ## variable slows every regex in the whole program).
    if ( $url =~ m{^(.*?)/(.*)$}s ) {    ### everything up to the first /
        $host     = $1;                  ## the domain for the call
        $document = "/$2";    ## host and document, e.g. www.x.com/idx.htm
    }
    else {
        ##else just request the root of the domain
        $host     = $url;     ## nope, it's just a host
        $document = "/";
    }
    ###
    #separate out an explicit port if there, default to 80
    ###
    my $port;
    if ( $host =~ /(.*?)\:(\d+)/ ) {
        $host = $1;
        $port = $2;
    }
    else { $port = 80 }
    print "connecting to $host:$port ..";
    ## BUG FIX: PeerPort was "http($port)", which looks up the "http"
    ## service (port 80) FIRST and only falls back to $port on lookup
    ## failure -- so explicit ports in bookmark URLs were ignored.
    $remote = IO::Socket::INET->new(
        Proto    => "tcp",
        Timeout  => "5",
        PeerAddr => $host,
        PeerPort => $port,
    );
    my $content;
    if ($remote) {
        $remote->autoflush(1);
        print ">>flushed>>";
        print $remote "GET $document HTTP/1.1" . $EOL;
        print $remote "Host: $host" . $EOL;
        print $remote
          'User-Agent: Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.0.1) Gecko/20020823 Netscape/7.0'
          . $EOL;
        print $remote
          'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1'
          . $EOL;
        print $remote "Connection: close" . $EOL;
        ## a blank line ends the request headers
        print $remote $EOL;
        print $remote $EOL;
        print " asked for $document..";
        ## slurp the whole response, headers and all
        while (<$remote>) {
            $content .= $_;
        }
        close $remote;
        print "closed connection";
    }
    else {
        print " cannot connect\n";
    }
    return $content;
}