From: Luke Reeves
Date: Sun, 12 Jul 2009 15:48:09 +0000 (-0400)
Subject: Fixed snapshot parsing and viewing
X-Git-Tag: 2.1-alpha-2019-0-29~153
X-Git-Url: http://91.132.146.200/gitweb/?a=commitdiff_plain;h=cad54fbbad645b876cce326e60bf2816144bc62d;p=insipid.git

Fixed snapshot parsing and viewing
---
Review notes (added while restoring this patch; not part of the original commit):
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy. Indentation and the position of blank context lines are assumptions;
    use `git apply --ignore-whitespace` if the context does not match.
  * The string literals in end() and comment() were empty in the collapsed copy,
    presumably because markup was stripped during extraction. They are restored
    here as "</$tag>" and "<!--$comment-->" -- TODO confirm against the repository.
  * New lib/Insipid/Parser.pm only: `($val = "/")` was an assignment used as a
    condition; it is now the comparison `($val eq "/")`. The removed MyParser
    lines keep the old text so the hunk still matches the pre-image. Because of
    this change the blob id on the Parser.pm index line no longer matches.

diff --git a/lib/Insipid/Parser.pm b/lib/Insipid/Parser.pm
new file mode 100755
index 0000000..d9a41a3
--- /dev/null
+++ b/lib/Insipid/Parser.pm
@@ -0,0 +1,115 @@
+package Insipid::Parser;
+
+use HTML::Parser;
+use HTML::Entities ();
+use URI::URL;
+use Digest::MD5 qw(md5 md5_hex);
+use Insipid::Config;
+use Insipid::Database;
+
+use vars qw(@ISA);
+@ISA = qw(HTML::Parser);
+
+sub setSnapshotMap {
+	my($self, $ssMap) = (@_);
+	$self->{SSMAP} = $ssMap;
+}
+
+sub new {
+	my $pack = shift;
+	my $self = $pack->SUPER::new;
+	@{$self}{qw(__base __grabit)} = @_;
+	$self;
+}
+
+sub declaration {
+	my $self = shift;
+	my ($decl) = @_;
+}
+
+sub start {
+	my $self = shift;
+	my ($tag, $attr, $attrseq, $origtext) = @_;
+
+	if(!defined($self->{__grabit})) {
+		print("<$tag");
+	}
+
+	for (keys %$attr) {
+		my $val = $attr->{$_};
+		if(($_ eq "/") && ($val eq "/")) { next; }
+
+		if(!defined($self->{__grabit})) {
+			print(" $_=\"");
+		}
+
+		if( "$tag $_" =~ /^(link href|img src)$/i) {
+			$val = url($val)->abs($self->{__base},1);
+
+			if(!defined($self->{__grabit})) {
+				if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {
+					my $md5 = md5_hex("$val");
+					$val = $snapshot_url . $md5;
+				}
+			} else {
+				# JPG, GIF, PNG and CSS
+				if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {
+					join_urls($self->{__base}, $val);
+					$val = $self->{__grabit}($val, $1);
+				}
+			}
+		}
+
+		if(!defined($self->{__grabit})) {
+			# Check against our snapshot map
+			if(($tag =~ /^a/i) && ($_ =~ /^href/i)) {
+				my $sst = $self->{SSMAP};
+
+				if(defined($sst->{$val})) {
+					print $snapshot_url . $sst->{$val};
+					print('"');
+				} else {
+					print("$val\"");
+				}
+			} else {
+				print("$val\"");
+			}
+		}
+	}
+
+	if(!defined($self->{__grabit})) { print(">"); }
+}
+
+sub end {
+	my $self = shift;
+	my ($tag) = @_;
+
+	if(!defined($self->{__grabit})) { print("</$tag>"); }
+}
+
+sub text {
+	my $self = shift;
+	my ($text) = @_;
+
+	if(!defined($self->{__grabit})) { print("$text"); }
+}
+
+sub comment {
+	my $self = shift;
+	my ($comment) = @_;
+
+	if(!defined($self->{__grabit})) { print("<!--$comment-->"); }
+}
+
+sub join_urls {
+	my($parent, $child) = (@_);
+	my $sql = "insert into $tbl_pagecache_references(md5_parent, md5_child) values(?, ?)";
+	my $sth = $dbh->prepare($sql);
+	$sth->execute(md5_hex($parent), md5_hex($child));
+	if($sth->err) {
+		# ignore errors for now
+	}
+}
+
+1;
+__END__
diff --git a/lib/Insipid/Snapshots.pm b/lib/Insipid/Snapshots.pm
index 3e2f567..9b802b4 100755
--- a/lib/Insipid/Snapshots.pm
+++ b/lib/Insipid/Snapshots.pm
@@ -28,6 +28,7 @@ use Insipid::Config;
 use Insipid::Database;
 use Insipid::Util;
 use Insipid::LinkExtractor;
+use Insipid::Parser;
 use CGI qw/:standard/;
 use CGI::Carp qw(fatalsToBrowser);
 use Date::Format;
@@ -228,7 +229,7 @@ sub show_snapshot {
 	}
 	print "\r\n";
 
-	my $p = MyParser->new($row[2], undef);
+	my $p = Insipid::Parser->new($row[2], undef);
 	$p->setSnapshotMap(\%internalLinks);
 
 	if($row[0] =~ /utf/i) {
@@ -476,7 +477,7 @@ sub do_snapshot {
 sub parsepage {
 	my ($url, $content, $content_type) = (@_);
 
-	my $p = MyParser->new($url, \&fetch_url);
+	my $p = Insipid::Parser->new($url, \&fetch_url);
 	if($content_type =~ /utf/i) {
 		$p->utf8_mode(1);
 	}
@@ -484,122 +485,5 @@ sub parsepage {
 	$p->parse($content);
 }
 
-## "use MyParser;" ## TODO: Make this a separate file.
-BEGIN {
-	package MyParser;
-	use HTML::Parser;
-	use HTML::Entities ();
-	use URI::URL;
-	use Digest::MD5 qw(md5 md5_hex);
-	use Insipid::Config;
-	use Insipid::Database;
-
-	use vars qw(@ISA);
-	@ISA = qw(HTML::Parser);
-
-	sub setSnapshotMap {
-		my($self, $ssMap) = (@_);
-		$self->{SSMAP} = $ssMap;
-	}
-
-	sub new {
-		my $pack = shift;
-		my $self = $pack->SUPER::new;
-		@{$self}{qw(__base __grabit)} = @_;
-		$self;
-	}
-
-	sub declaration {
-		my $self = shift;
-		my ($decl) = @_;
-	}
-
-	sub start {
-		my $self = shift;
-		my ($tag, $attr, $attrseq, $origtext) = @_;
-
-		if(!defined($self->{__grabit})) {
-			print("<$tag");
-		}
-
-		for (keys %$attr) {
-			my $val = $attr->{$_};
-			if(($_ eq "/") && ($val = "/")) { next; }
-
-			if(!defined($self->{__grabit})) {
-				print(" $_=\"");
-			}
-
-			if( "$tag $_" =~ /^(link href|img src)$/i) {
-				$val = url($val)->abs($self->{__base},1);
-
-				if(!defined($self->{__grabit})) {
-					if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {
-						my $md5 = md5_hex("$val");
-						$val = $snapshot_url . $md5;
-					}
-				} else {
-					# JPG, GIF, PNG and CSS
-					if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {
-						join_urls($self->{__base}, $val);
-						$val = $self->{__grabit}($val, $1);
-					}
-				}
-			}
-
-			if(!defined($self->{__grabit})) {
-				# Check against our snapshot map
-				if(($tag =~ /^a/i) && ($_ =~ /^href/i)) {
-					my $sst = $self->{SSMAP};
-
-					if(defined($sst->{$val})) {
-						print $snapshot_url . $sst->{$val};
-						print('"');
-					} else {
-						print("$val\"");
-					}
-				} else {
-					print("$val\"");
-				}
-			}
-		}
-
-		if(!defined($self->{__grabit})) { print(">"); }
-	}
-
-	sub end {
-		my $self = shift;
-		my ($tag) = @_;
-
-		if(!defined($self->{__grabit})) { print("</$tag>"); }
-	}
-
-	sub text {
-		my $self = shift;
-		my ($text) = @_;
-
-		if(!defined($self->{__grabit})) { print("$text"); }
-	}
-
-	sub comment {
-		my $self = shift;
-		my ($comment) = @_;
-
-		if(!defined($self->{__grabit})) { print("<!--$comment-->"); }
-	}
-
-	sub join_urls {
-		my($parent, $child) = (@_);
-		my $sql = "insert into $tbl_pagecache_references(md5_parent, md5_child) values(?, ?)";
-		my $sth = $dbh->prepare($sql);
-		$sth->execute(md5_hex($parent), md5_hex($child));
-		if($sth->err) {
-			# ignore errors for now
-		}
-	}
-
-}
-## end "use MyParser;" ##
-
 1;
 __END__