--- /dev/null
+package Insipid::Parser;\r
+\r
+use HTML::Parser;\r
+use HTML::Entities ();\r
+use URI::URL;\r
+use Digest::MD5 qw(md5 md5_hex);\r
+use Insipid::Config;\r
+use Insipid::Database;\r
+\r
+use vars qw(@ISA);\r
+@ISA = qw(HTML::Parser);\r
+\r
+# Store the snapshot lookup table that start() consults when rewriting\r
+# link hrefs. $ssMap is a hash reference; it is kept under the SSMAP key.\r
+sub setSnapshotMap {\r
+    my $self  = shift;\r
+    my $ssMap = shift;\r
+\r
+    $self->{SSMAP} = $ssMap;\r
+}\r
+\r
+# Constructor. Builds the parent-class parser object with no arguments,\r
+# then records the first two caller arguments as the base URL (__base)\r
+# and the optional fetch callback (__grabit). Returns the new object.\r
+sub new {\r
+    my ($pack, @args) = @_;\r
+\r
+    my $self = $pack->SUPER::new();\r
+    ($self->{__base}, $self->{__grabit}) = @args;\r
+\r
+    return $self;\r
+}\r
+\r
+# Declaration handler: receives the declaration text and does nothing\r
+# with it, so declarations are never printed.\r
+sub declaration {\r
+    my ($self, @rest) = @_;\r
+    my ($decl) = @rest;\r
+}\r
+\r
+# Start-tag handler.\r
+#\r
+# Arguments: the tag name and a hash reference of its attributes (any\r
+# further arguments are ignored).\r
+#\r
+# Without a __grabit callback the tag is re-emitted with print(), with\r
+# attribute values rewritten as follows:\r
+#   * "link href" / "img src" values are made absolute against __base;\r
+#     those ending in .gif/.jpg/.png/.css are replaced by $snapshot_url\r
+#     followed by the MD5 hex digest of the absolute URL;\r
+#   * "a href" / "area href" values present in the snapshot map (SSMAP)\r
+#     are replaced by $snapshot_url followed by the mapped value.\r
+# With a __grabit callback nothing is printed; each matching image or\r
+# stylesheet URL is recorded with join_urls() and passed to the callback\r
+# together with its matched extension.\r
+sub start {\r
+    my $self = shift;\r
+    my ($tag, $attr) = @_;\r
+\r
+    my $printing = !defined($self->{__grabit});\r
+\r
+    print("<$tag") if $printing;\r
+\r
+    # A named lexical is used instead of $_ so the helpers called below\r
+    # cannot clobber the loop variable.\r
+    for my $name (keys %$attr) {\r
+        my $val = $attr->{$name};\r
+\r
+        # Skip the "/" pseudo-attribute of self-closing tags. This must be\r
+        # a comparison: an assignment is always true and overwrites $val.\r
+        next if $name eq "/" && $val eq "/";\r
+\r
+        if ("$tag $name" =~ /^(link href|img src)$/i) {\r
+            $val = url($val)->abs($self->{__base}, 1);\r
+\r
+            # JPG, GIF, PNG and CSS. The extension is copied into a lexical\r
+            # because passing $1 itself hands the callee an alias that its\r
+            # own regex matches could change.\r
+            if (my ($ext) = $val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {\r
+                if ($printing) {\r
+                    $val = $snapshot_url . md5_hex("$val");\r
+                }\r
+                else {\r
+                    join_urls($self->{__base}, $val);\r
+                    $val = $self->{__grabit}($val, $ext);\r
+                }\r
+            }\r
+        }\r
+\r
+        next unless $printing;\r
+\r
+        # Check links against our snapshot map. Both patterns are anchored\r
+        # so that e.g. "hreflang" or an <abbr> tag is not treated as a link.\r
+        if ($tag =~ /^(?:a|area)$/i && $name =~ /^href$/i) {\r
+            my $sst = $self->{SSMAP};\r
+            $val = $snapshot_url . $sst->{$val} if defined($sst->{$val});\r
+        }\r
+\r
+        # The parser hands over attribute values with entities decoded, so\r
+        # re-encode the characters that would break a double-quoted value.\r
+        print(" $name=\"" . HTML::Entities::encode_entities("$val", '<>&"') . "\"");\r
+    }\r
+\r
+    print(">") if $printing;\r
+}\r
+\r
+# End-tag handler: prints the closing tag for $tag unless a __grabit\r
+# callback is set.\r
+sub end {\r
+    my ($self, $tag) = @_;\r
+\r
+    if (not defined $self->{__grabit}) {\r
+        print('</' . $tag . '>');\r
+    }\r
+}\r
+\r
+# Text handler: prints the text chunk unchanged unless a __grabit\r
+# callback is set.\r
+sub text {\r
+    my ($self, $text) = @_;\r
+\r
+    if (not defined $self->{__grabit}) {\r
+        print("$text");\r
+    }\r
+}\r
+\r
+# Comment handler: prints the comment wrapped in "<!-- " and " -->"\r
+# unless a __grabit callback is set.\r
+sub comment {\r
+    my ($self, $comment) = @_;\r
+\r
+    if (not defined $self->{__grabit}) {\r
+        print('<!-- ' . $comment . ' -->');\r
+    }\r
+}\r
+\r
+# Record that page $parent references resource $child by inserting the\r
+# MD5 hex digests of both URLs into $tbl_pagecache_references.\r
+#\r
+# Best effort: a failed insert is ignored and nothing is returned.\r
+sub join_urls {\r
+    my ($parent, $child) = @_;\r
+\r
+    my $sql = "insert into $tbl_pagecache_references(md5_parent, md5_child) values(?, ?)";\r
+\r
+    # NOTE(review): $dbh is presumably a DBI handle, whose prepare() can\r
+    # return undef on failure. Bail out quietly in that case instead of\r
+    # dying on a method call against undef.\r
+    my $sth = $dbh->prepare($sql) or return;\r
+\r
+    # Errors from the insert itself are still ignored for now.\r
+    $sth->execute(md5_hex($parent), md5_hex($child));\r
+\r
+    return;\r
+}\r
+\r
+1;\r
+__END__\r
use Insipid::Database;\r
use Insipid::Util;\r
use Insipid::LinkExtractor;\r
+use Insipid::Parser;\r
use CGI qw/:standard/;\r
use CGI::Carp qw(fatalsToBrowser);\r
use Date::Format;\r
}\r
\r
print "\r\n";\r
- my $p = MyParser->new($row[2], undef);\r
+ my $p = Insipid::Parser->new($row[2], undef);\r
$p->setSnapshotMap(\%internalLinks);\r
\r
if($row[0] =~ /utf/i) {\r
sub parsepage {\r
my ($url, $content, $content_type) = (@_);\r
\r
- my $p = MyParser->new($url, \&fetch_url);\r
+ my $p = Insipid::Parser->new($url, \&fetch_url);\r
if($content_type =~ /utf/i) { \r
$p->utf8_mode(1);\r
}\r
$p->parse($content);\r
}\r
\r
-## "use MyParser;" ## TODO: Make this a separate file.\r
-BEGIN {\r
- package MyParser;\r
- use HTML::Parser;\r
- use HTML::Entities ();\r
- use URI::URL;\r
- use Digest::MD5 qw(md5 md5_hex);\r
- use Insipid::Config;\r
- use Insipid::Database;\r
-\r
- use vars qw(@ISA);\r
- @ISA = qw(HTML::Parser);\r
-\r
- sub setSnapshotMap {\r
- my($self, $ssMap) = (@_);\r
- $self->{SSMAP} = $ssMap;\r
- }\r
-\r
- sub new {\r
- my $pack = shift;\r
- my $self = $pack->SUPER::new;\r
- @{$self}{qw(__base __grabit)} = @_;\r
- $self;\r
- }\r
-\r
- sub declaration {\r
- my $self = shift;\r
- my ($decl) = @_;\r
- }\r
-\r
- sub start {\r
- my $self = shift;\r
- my ($tag, $attr, $attrseq, $origtext) = @_;\r
-\r
- if(!defined($self->{__grabit})) {\r
- print("<$tag");\r
- }\r
-\r
- for (keys %$attr) {\r
- my $val = $attr->{$_};\r
- if(($_ eq "/") && ($val = "/")) { next; }\r
-\r
- if(!defined($self->{__grabit})) { \r
- print(" $_=\""); \r
- }\r
-\r
- if( "$tag $_" =~ /^(link href|img src)$/i) {\r
- $val = url($val)->abs($self->{__base},1);\r
- \r
- if(!defined($self->{__grabit})) {\r
- if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {\r
- my $md5 = md5_hex("$val");\r
- $val = $snapshot_url . $md5;\r
- }\r
- } else {\r
- # JPG, GIF, PNG and CSS\r
- if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {\r
- join_urls($self->{__base}, $val);\r
- $val = $self->{__grabit}($val, $1);\r
- }\r
- }\r
- }\r
-\r
- if(!defined($self->{__grabit})) {\r
- # Check against our snapshot map\r
- if(($tag =~ /^a/i) && ($_ =~ /^href/i)) {\r
- my $sst = $self->{SSMAP};\r
-\r
- if(defined($sst->{$val})) {\r
- print $snapshot_url . $sst->{$val};\r
- print('"');\r
- } else {\r
- print("$val\"");\r
- }\r
- } else {\r
- print("$val\"");\r
- }\r
- }\r
- }\r
-\r
- if(!defined($self->{__grabit})) { print(">"); }\r
- }\r
-\r
- sub end {\r
- my $self = shift;\r
- my ($tag) = @_;\r
-\r
- if(!defined($self->{__grabit})) { print("</$tag>"); }\r
- }\r
-\r
- sub text {\r
- my $self = shift;\r
- my ($text) = @_;\r
-\r
- if(!defined($self->{__grabit})) { print("$text"); }\r
- }\r
-\r
- sub comment {\r
- my $self = shift;\r
- my ($comment) = @_;\r
-\r
- if(!defined($self->{__grabit})) { print("<!-- $comment -->"); }\r
- }\r
- \r
- sub join_urls {\r
- my($parent, $child) = (@_);\r
- my $sql = "insert into $tbl_pagecache_references(md5_parent, md5_child) values(?, ?)";\r
- my $sth = $dbh->prepare($sql);\r
- $sth->execute(md5_hex($parent), md5_hex($child));\r
- if($sth->err) {\r
- # ignore errors for now\r
- }\r
- }\r
-\r
-}\r
-## end "use MyParser;" ##\r
-\r
1;\r
__END__\r