#!/usr/bin/perl -w use HTML::Parser; use URI; use strict; use warnings; use Carp; use Data::Dumper; my $base=URI->new($ARGV[1]); my %seen; my @todo; my %im_neg; my $current_array; # pull title out of an HTML page, by parsing it; end title # handler used to pull parsing up short sub file_title { my ($filename) = @_; if($filename eq "site_map.html") { return "(you are here)"; } my $p = HTML::Parser->new(api_version => 3, start_h => [\&title_start_handler, "self,tagname,attr"], report_tags => [qw(title)], ); $p->parse_file($filename) || die "$filename $!"; my $a = $p->handler("text"); my $title = join("", @$a); $title =~ s/^\s+//; $title =~ s/\s+$//; $title =~ s/\s+/ /g; return $title; } sub title_start_handler { my($self, $tag, $attr) = @_; $self->handler(text => [], '@{dtext}' ); # accumulate text params into anon array $self->handler(end => \&title_end_handler, "self"); # install the killer! } sub title_end_handler { my($self) = @_; $self->eof(); } sub resolve { my ($path) = @_; return URI->new($path)->rel($base)->as_string(); } sub path_is_abs { ($_) = @_; return m@/@ || m/:/; } sub local_not_html { my($path) = @_; my $thing = { title => $path, path => $path, array => [], }; push @$current_array , $thing; } sub a_start_handler { my($self, $tag, $attr) = @_; my($path); if($tag eq "base") { print "BASE $attr->{href}\n"; $base = URI->new($attr->{href}); } if($tag eq "a" && defined($attr->{href})) { $path = resolve($attr->{href}); if(!path_is_abs($path)) { #local page, on this site if($path !~ s/#.*$//g) { # suppress intra page anchors if($path =~ /\.html$/) { if($path ne "index.html") { my $thing = { title => file_title($path), path => $path, array => [], }; push @$current_array , $thing; if(!exists($seen{$path})) { push @todo, $thing; } } } else { # local file, probably JPG or PDF local_not_html($path); } } } } if($tag eq "img") { $path = resolve($attr->{src}); # only link local images if(!path_is_abs($path) && $path !~ "thumb\." && !exists($im_neg{$path})) { local_not_html($path); } } } sub do_file { my ($filename) = @_; if(!exists($seen{$filename})) { $seen{$filename} = 1; my $p = HTML::Parser->new(api_version => 3, report_tags => [qw(base a img)], ); $p->handler(start => \&a_start_handler, "self,tagname,attr"); $p->parse_file($filename) || die $!; } } sub root { my ($start) = @_; my $start_thing = { path => $start, array => [], }; push @todo, $start_thing; # classic breadth first algorithm, builds up # data against each page while(scalar(@todo) != 0) { my $thing = shift @todo; $current_array = $thing->{array}; do_file($thing->{path}); } # print Dumper($start_thing); return $start_thing; } # walks data built up sub gen_html { my ($root) = @_; my $a = $root->{array}; if(scalar(@$a) != 0) { printf("