MARCproxy0.2.pl

#!/bin/perl


my $legal= <<END

MARCproxy: an interactive screen scraper for copy cataloging
Copyright (c) 2004 Charles McCallum, mccalluc-yahoo-com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation (but without the
requirement of distributing the full GPL with the source) 
either version 2 of the License, or (at your option) any 
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

END
;

use HTTP::Daemon;
use HTTP::Status;
use HTTP::Headers;
require LWP::UserAgent;
use HTML::Entities;
use MARC::Record;
use MARC::Field;


use strict;

my $debug=1;
my $lasthtml; # content of the last HTTP::Response.

my $commandurl="http://0.0.0.0/";
# requests to this address should be intercepted by the proxy and the appropriate command run.
# note that it needs to look like a legit url: barewords will have '.com' appended to them by IE.

my $message=<<END
Right now the regexps are only tuned to work with the 
<a href="http://catalog.loc.gov/cgi-bin/Pwebrecon.cgi?DB=local&PAGE=First"
target="_blank">Library of Congress</a>. Also note that the proxy will not 
fetch files that it guesses are graphics (because it's slow enough as is),
so you might as well turn them off in your browser.
<p>Keep this window around and when you've brought up the MARC data that you want,
click 'grab', or click 'quit' to quit (duh).
<p>
<a href="${commandurl}legal">Copyright (C) 2004 Charles McCallum</a>
END
;

my $err=0;  # $err is set when an error occurs, and remains set until a higher
            # level in the program turns it off. $message should be used to accumulate
            # the text of error messages: it is cleared when the contents are 
            # inserted into a page (line breaks will be converted to <br>).
            
        
my $outputpath=$ARGV[0];
until(-w $outputpath) {
    print "File to save MARC output: ";
    chomp($outputpath=<>)
    };

my $d = new HTTP::Daemon LocalPort=>2512 || die "couldn't open port 2512\n";
# choice of port is arbitrary: there is an argument for going with the default,
# which is to randomly choose one.

my $response;

print $legal;

print "In your browser please set proxy server to: ", $d->url, "\n";
print "and then go to $commandurl\n";

my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;


# All this HTTP stuff is pretty much copied from the documentation of the modules.
# The outer loops handle the server side of things,
# and the inner portion is mostly the client side of things.

# As a server, we wait for the user ($c) to make a request of us.
# If we're not doing anything special, then we ($ua) just pretend to be the 
# user, and make the HTTP request on their behalf, and pass them back the result.

while (my $c = $d->accept) {                # returns an HTTP::Daemon::ClientConn object
                                            # (subclass of IO::Socket::INET)
        
  while (my $request = $c->get_request) {   # returns an HTTP::Request object
                                            # (subclass of HTTP::Message)
    print '>>>',$request->as_string() if $debug;
    # would probably be good to add in a check on the ip: being an open proxy
    # server is bad.
    
    if ($request->uri()=~/(gif)|(jpeg)|(jpg)$/i) {
        $c->send_error(RC_NOT_FOUND);
        print "*** error sent\n" if $debug;
        next };
    # since this server doesn't fork, we don't want to load images or anything else.
    # (a real proxy server would consist of multiple processes, so it could do more
    #  than one thing at a time.)
    
    # From the user's point of view, the interaction is like a CGI, but from
    # the inside it's different. $lasthtml holds the last document handled by the proxy, 
    # excluding refreshes to the command window.
    
    if ($request->uri()=~m|^$commandurl|i) {
        
        if ($request->uri()=~m|/quit$|i) {
            sendok($c,commandhtml("Local proxy server has quit. Turn off proxy server option in your browser."));
            die }
        
        elsif ($request->uri()=~m|/grab$|i) {
            $message='';
            my $marc=html2marc($lasthtml);
            if ($response and not $marc) {
                # if the html2marc failed, then try to follow a link to marc data
                $marc=html2marc(followmarclink($ua,$response))
                };
            marc2file($marc);
            # eventually, go to the marc data for the user
            warn "err:$err message:$message\n$lasthtml\n" if debug;
            if ($err) {
                sendok($c,commandhtml($message));
                $err=0
                }
            else {
                $message='Check that that data has imported correctly, either here or at the target.';
                $message.='<pre>'.$marc->as_formatted().'</pre>';
                sendok($c,commandhtml($message));
                }
            }
            
        elsif ($request->uri()=~m|/legal$|i) {
            sendok($c,commandhtml(text2html($legal)))
            }
            
        elsif ($request->uri()=~m|^$commandurl$|) {
            sendok($c,commandhtml($message));
            }
            
        else {
            sendok($c,commandhtml('Unrecognized command.'))
            };
        
        next # get the next request.
        };
            
    
    # if a command hasn't been issued, just get the document they requested.
    $response = $ua->request($request); # an HTTP::Response
                                        # (subclass of HTTP::Message) 

    print '<<<',$response->status_line(),"\n",$response->as_string() if $debug; 

    if ($response->content_type eq 'text/html'  ) {
        print "*** HTML received\n" if $debug;
        $lasthtml=$response->content();
        };
        
    $c->send_response($response);
    };
    
  $c->close;
  undef($c);
};


    
###########################
## proxy server routines ## 
###########################

sub sendok {

# sends a document to the client.
# first argument is the client object, second is what to send.
# if the second is tagged but not a full document,
#   wrap it.
# if it's not even tagged, html encode and tag it
#   (at present, this isn't used)

    my $c=shift; # should be a HTTP::Daemon::ClientConn object
    my $body=shift;
    my $html;
    if ($body!~m/<.{1,4}>/i) { # if the body isn't tagged
        $body=text2html($body)
        };
    $html=$body!~m/<html>/i?
        "<html><title>local window</title><body>$body</body></html>":
        $body;
    $c->send_response(
        HTTP::Response->new(
            200,    # "OK"
            '',     # a message
            HTTP::Headers->new(Content_Type => 'text/html'),
                    # HTTP header (probably optional)
            $html
            )
        )
    };
    
sub commandhtml {

# generates the inside of the <body> to refresh the command window.

    return commands('grab','quit').
        '<br>'.
        join('<br>',@_)
    };
    
sub commands {

# generates the html for the commands given as args.

    return "<center>".
        join ( ' ',
            map ( qq([<a href="$commandurl$_">$_</a>]), @_ ) ).
        "</center>\n"
    };
        
sub text2html {

# converts straight text to html.

    my $r=shift;
    encode_entities($r);
    $r=~s/\n\s+/<p>/g;
    return $r
    };



###########################################################
## main routine for getting data into a database or file ##
###########################################################

sub marc2file {
    my $marc=shift;

    unless ($marc) {
        $message="No importable data was found.\n";
        $err=1;
        return};
        
    # rather than just dumping to a file at this point, you could do something more creative.   
        
    unless (open FILE, ">>$outputpath") {
        $message="Couldn't open $outputpath to save data\n";
        $err=1;
        return};
    
    print FILE $marc->as_formatted();
    
    close FILE;
    
    };


###############################################
## subroutine for extracting data from html ##
###############################################


sub html2marc {

# given a string containing HTML, try to make sense of it and return a MARC::Record
# object. probably would be good to add in some error checking.

    $_ = shift;
    my $record=MARC::Record->new();

    # First check if there are <pre> tags:

    if (m/<pre[^>]*?>/si) {
        # if it uses <pre> formatting, life is easy.
        # (like the ASU catalog (innopac?) )

        # remove everything but the <pre> text
        s|^.*<pre[^>]*?>(.*)</pre[^>]*?>.*$|$1|si;
        
        # Remove everything before the first tag that begins with a zero.
        s/^.*?\D(0\d\d\D)/$1/s;
        
        # and then remove padding from each line.
        s/^\s*//mg;
        s/\s*$//mg;
        
        # merge lines that are split.
        s/\n(\D|\d\d\D|\d\d\d\d\D)/ $1/g;
        
        # unfortunately, innopac seems to strip the first subfield marker.
        # we'll assume it's 'a' and put it back

        s/^(.......)/$1|a/mg;
    

    }
    else {
        # life is not easy.         
        # (like LoC)
        
        # remove the top of the file
        s/^.*?\D(0\d\d\D)/$1/s;
    
        # remove newlines, since they're not meaningful, and we're going to want
        # to put them back in with meaning.
        s/\n//g;
    
        # Replace <td> with a space
        s/<td>/ /gsi;
    
        # Replace <tr> with \n and then strip all html tags.
        s/<tr>/\n/gsi;
        s/<[^>]*?>//gsi;
        
        # and then remove padding from each line.
        s/^\s*//mg;
        s/\s*$//mg;
        
        # remove the trailing text (each marc line begins with a digit)
        s/\n\D.*$//s;
    };
    
    decode_entities($_);

    for my $line (split /\n/g) {
    
        next unless $line;

        warn "$line\n" if $debug;

        # skip any bad lines that might have gotten through.
        next unless $line=~m/^\d\d\d\D/;

        # parse out the field.
        
        my $tag;
        my $i1; 
        my $i2;
        my $fieldtext;
        
        if ($line=~/^(00\d)\s+(.*)$/) {
            $tag=$1;
            $fieldtext=$2;
            $record->append_fields(
                MARC::Field->new( $tag, $fieldtext) )
            }
        elsif ($line=~m/^(\d\d\d)\s+(.)(.)\s+(.*)$/) {
            $tag = $1;
            $i1=$2 || ' '; # I had these two commented out earlier: why?
            $i2=$3 || ' ';
            $fieldtext = $4;
            
            my @subfieldlist;       
            # figure out the subfields
            for (split /\s*\|/, $fieldtext) {
                if (m/(.)\s*(.*\S)\s*/) {
                    push @subfieldlist , ($1,$2);
                    # note: not a LoL, as for Field->new()
                    }
                };
            warn join(':',$tag, $i1, $i2, @subfieldlist)."\n" if $debug;
            
            $record->append_fields(
                MARC::Field->new( $tag, $i1, $i2, @subfieldlist )
                )
            }
        };
        
    
    if ($record->fields()) {
        warn "returning ".scalar $record->fields()." fields\n" if $debug;
        return $record}
    else {
        return undef}
    
    };



##########################################################
## HTTP extras: routines for spidering and parsing HTML ##
##########################################################

sub followmarclink {

# given a user agent object and a response object
# looks for link to marc data in the content and follows it,
# returning the response.

    my $ua=shift;
    my $response=shift;

    return unless ref $response eq 'HTTP::Response';

    my $base=$response->base();
    $base=~s|^(http://[^/]*).*$|$1|;
    my $html=$response->content();
    
    
    warn "\n----> $& \n" if $debug;
    
    $html=~m/
        href="?         # quotes are optional
        ([^" ]*)        # the (perhaps relative) url
        [^>]*>          # rest of the <a> tag
        \s*<img[^>]*    # How do we know that this is the right link?
        alt="[^"]*marc  # because it contains "marc" in the alt field.
        /isx || warn "couln't find marc alt\n";
    
    warn "\n----> $& \n" if $debug; 
    
    my $url=$1;
    
    warn "\n----> $url \n" if $debug;
    warn "\nbase> $base \n" if $debug;
    
    $url=$base.$url if $url=~m{^/};
    
    
    warn "----> $url \n" if $debug;
    
    $response=$ua->request(HTTP::Request->new(GET=>$url));
    return $response->content();
    
    };