Suppose we want to delegate to a Perl program the task of checking URLs in my Netscape bookmark file. I'm told that this isn't the same format as is used in newer Netscapes. But, antiquarian that I am, I still use Netscape 4.76, and this is what the file looks like:
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
Do Not Edit! -->
<TITLE>Bookmarks for Sean M. Burke</TITLE>
<H1>Bookmarks for Sean M. Burke</H1>
<DL><p>
    <DT><H3 ADD_DATE="911669103">Personal Toolbar Folder</H3>
    <DL><p>
        <DT><A HREF="http://libros.unm.edu/" ADD_DATE="908672224" ...
        <DT><A HREF="http://www.melvyl.ucop.edu/" ADD_DATE="900184542" ...
        <DT><A HREF="http://www.guardian.co.uk/" ADD_DATE="935897798" ...
        <DT><A HREF="http://www.booktv.org/schedule/" ADD_DATE="935897798" ...
        <DT><A HREF="http://www.suck.com/" ADD_DATE="942604862" ...
...and so on...
There are three important things we should note here:
Each bookmark item is on a line of its own. This means we can use the handy Perl idioms for line-at-a-time processing such as while(<IN>) {...} or @lines = <IN>.
Every URL is absolute. There are no relative URLs such as HREF="../stuff.html". That means we don't have to bother with making URLs absolute (not yet, at least).
The only thing we want from this file is the URL in the HREF="...url..." part of each line; if a line has no HREF, we can ignore it. This practically begs us to use a Perl regexp, as sketched just below this list!
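To make that last point concrete, here's a minimal sketch of the extraction. The sample line (and its "Suck" link text) is invented, but it's shaped like the real bookmark lines above:

my $line =
  '<DT><A HREF="http://www.suck.com/" ADD_DATE="942604862">Suck</A>';
if ($line =~ m/ HREF="([^"\s]+)" /) {
  print "Found: $1\n";   # prints "Found: http://www.suck.com/"
}

(And if the file did someday contain relative URLs, the URI class's new_abs method could resolve them against a base URL; as noted above, we don't need that here.)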
So we scan the file one line at a time, find URLs in lines that have an HREF="...url..." in them, and then check those URLs. Example 6-4 shows such a program.
#!/usr/bin/perl -w
# bookmark-checker - check URLs in Netscape bookmark file
use strict;
use LWP;
my $browser;
my $bmk_file = $ARGV[0]
  || 'c:/Program Files/Netscape/users/sburke/bookmark.htm';
open(BMK, "<$bmk_file") or die "Can't read-open $bmk_file: $!";
while (<BMK>) {
  check_url($1) if m/ HREF="([^"\s]+)" /;
}
print "# Done after ", time - $^T, "s\n";
exit;

# Declaring this after the exit is fine: "my" takes effect at
# compile time, so the subs below can still see it.
my %seen;  # for tracking which URLs we've already checked

sub check_url {
  # Try to fetch the page and report failure if it can't be found.
  # This routine even specially reports if the URL has changed
  # to be on a different host.
  my $url = URI->new( $_[0] )->canonical;

  # Skip mailto: links, and in fact anything not http:...
  return unless $url->scheme eq 'http';

  # Kill anything like '#staff' in 'http://luddites.int/them.txt#staff'
  $url->fragment(undef);

  # Kill anything like the currently quite useless but
  # occasionally occurring 'jschmo@' in
  # 'http://jschmo@luddites.int/them.txt'
  # (It's useless because it doesn't actually show up
  # in the request to the server in any way.)
  $url->userinfo(undef);

  return if $seen{$url};  # silently skip duplicates
  $seen{$url} = 1;

  init_browser() unless $browser;
  my $response = $browser->head($url);
  my $found = URI->new( $response->request->url )->canonical;
  $seen{$found} = 1;  # so we don't check it later

  # If the server complains that it doesn't understand "HEAD"
  # (405 is "Method Not Allowed"), then retry it with "GET":
  $response = $browser->get($found) if $response->code == 405;

  if($found ne $url) {
    if($response->is_success) {
      # Report the move, but only if it's a very different URL.
      # That is, different schemes, or different hosts.
      if(
        $found->scheme ne $url->scheme
        or
        lc( $found->can('host') ? $found->host : '' )
          ne
        lc( $url->can('host') ? $url->host : '' )
      ) {
        print "MOVED: $url\n   -> $found\n";
      }
    } else {
      print "MOVED: $url\n   -> $found\n",
            "  but that new URL is bad: ",
            $response->status_line, "\n";
    }
  } elsif($response->is_success) {
    print "## okay: $url\n";
  } else {
    print "$url is bad! ", $response->status_line, "\n";
  }
  return;
}

sub init_browser {
  $browser = LWP::UserAgent->new;
  # Speak only HTTP - no mailto or FTP or anything.
  $browser->protocols_allowed( [ 'http' ] );
  # And any other initialization we might need to do.
  return $browser;
}
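Incidentally, the URI-cleanup calls at the top of check_url can be tried out in isolation. Here's a minimal sketch (the URL is an invented example built around the luddites.int hostname used in the comments above) of what canonical, fragment(undef), and userinfo(undef) each accomplish:

use URI;
# canonical() lowercases the scheme and host and drops a default :80 port:
my $url = URI->new(
  'HTTP://jschmo@Luddites.INT:80/them.txt#staff' )->canonical;
print "$url\n";         # http://jschmo@luddites.int/them.txt#staff
$url->fragment(undef);  # remove the '#staff' part
$url->userinfo(undef);  # remove the 'jschmo@' part
print "$url\n";         # http://luddites.int/them.txt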
And for this rigidly formatted input file, our line-at-a-time regexp-based approach works just fine; our simple loop:
while (<BMK>) { check_url($1) if m/ HREF="([^"\s]+)" / }
really does catch every URL in my Netscape bookmark file.
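Run as, say, perl bookmark-checker bookmark.htm, the program prints one report line per distinct URL. The following output is purely hypothetical (the moves and statuses are made up), but it shows the kinds of lines the program can print:

## okay: http://www.melvyl.ucop.edu/
MOVED: http://libros.unm.edu/
   -> http://elibros.unm.edu/
http://www.booktv.org/schedule/ is bad! 500 Internal Server Error
# Done after 15s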