ablog

不器用で落着きのない技術者のメモ

HTML::AutoPagerize を使ってみた

HTML::AutoPagerize を使ってみた。
コードは HTML::AutoPagerize - Utility to load AutoPagerize SITEINFO stuff - metacpan.org からほぼコピペ。

#!/usr/bin/perl
#
# Demonstrates HTML::AutoPagerize: registers a SITEINFO-style rule for
# tumblr.com blogs, fetches one page, and dumps the URI of the "next page"
# link that the rule resolves to.

use strict;
use warnings;

use Data::Dumper;
use HTML::AutoPagerize;
use LWP::Simple;

my $autopager = HTML::AutoPagerize->new;
$autopager->add_site(
	# The url value is a regular expression, so literal dots are escaped;
	# unescaped, ".tumblr.com" would also match e.g. "xtumblrycom".
	url         => 'http://.+\.tumblr\.com/',
	nextLink    => '//div[@id="content" or @id="container"]/div[last()]/a[last()]',
	# "and" is required to exclude both classes: with "or" the predicate is
	# true for every element that has a class attribute, so nothing is
	# filtered out.
	pageElement => '//div[@id="content" or @id="container"]/div[@class!="footer" and @class!="navigation"]',
);

my $uri  = 'http://otsune.tumblr.com/';
my $html = LWP::Simple::get($uri);

# LWP::Simple::get returns undef when the request fails; stop here rather
# than handing undef to the parser.
defined $html
	or die "Failed to fetch $uri\n";

my $res = $autopager->handle($uri, $html);
if ($res) {
	my $next_link = $res->{next_link};    # URI object
	my $content   = $res->{page_element}; # XML::XPathEngine::NodeSet object. may be empty (shown for reference; not used below)
	print Dumper $next_link;
}
  • 実行結果
$ ./autopagerize_tumblr.pl
$VAR1 = bless( do{\(my $o = 'http://otsune.tumblr.com/page/2')}, 'URI::http' );