ablog

不器用で落着きのない技術者のメモ

WWW::Mechanize と Web::Scraper を使って HTML からテキストを抽出して CSV を作成する Perl スクリプト

ちょっと書いてみた Perl スクリプトをメモっておく。

#!/usr/bin/env perl
use strict;
use warnings;
use WWW::Mechanize;
use Web::Scraper;
use utf8;
use Encode;

my $mech = 'WWW::Mechanize'->new('autocheck', 1);
$mech->get('http://www.ablog.co.jp/login');
$mech->submit_form('fields', {'name', 'foo', 'password', 'bar'});
$mech->get('http://www.ablog.co.jp/contract');
$mech->submit_form();

my $scraper = scraper {
	use utf8;
	process '/html/body/div/div[3]/table/tbody/tr',
	'list[]' => scraper {
		process '//td[3]', contract => 'TEXT';
		process '//td[4]', status => 'TEXT';
		process '//td[5]', expired => 'TEXT';
		process '//td[6]', customer => 'TEXT';
		process '//td[7]', group => 'TEXT';
		process '//td[8]', permission => 'TEXT';
		process '//td[9]', notes => 'TEXT';
	}
};

my $res = $scraper->scrape($mech->content);
for my $list (@{ $res->{list} }) {
	for (values(%$list)) {
		s/\s+//sg;
	}
}

&write_csv("contract.csv", $res->{list});

sub write_csv {
	my ($name, $data) = @_;

	open my $csv, ">:encoding(utf8)", $name or die $!;
	for my $code (@$data) {
		printf $csv qq{"%s","%s","%s","%s","%s","%s"\n}, @{$code}{qw/contract status expired customer group permission/};
	}
	close $csv;
}

__END__


追記(2010/03/24):

s/\s+//sg; のあたりはWeb::Scraperのfilterでやったほうが手っ取り早いかも?:
process '//td[3]', contract => ['TEXT', sub {s/\s+//sg;} ];

WWW::Mechanize と Web::Scraper を使って HTML からテキストを抽出して CSV を作成する Perl スクリプト - ablog

id:otsune さんにコメントを頂いたので、教えて頂いた書き方で書きなおしてみた。

#!/usr/bin/env perl
use strict;
use warnings;
use WWW::Mechanize;
use Web::Scraper;
use utf8;
use Encode;

my $mech = 'WWW::Mechanize'->new('autocheck', 1);
$mech->get('http://www.ablog.co.jp/login');
$mech->submit_form('fields', {'name', 'foo', 'password', 'bar'});
$mech->get('http://www.ablog.co.jp/contract');
$mech->submit_form();

my $scraper = scraper {
	use utf8;
	process '/html/body/div/div[3]/table/tbody/tr',
	'list[]' => scraper {
		process '//td[3]', contract => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[4]', status => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[5]', expired => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[6]', customer => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[7]', group => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[8]', permission => ['TEXT', sub {s/\s+//sg;} ];
		process '//td[9]', notes => ['TEXT', sub {s/\s+//sg;} ];
	}
};

my $res = $scraper->scrape($mech->content);
&write_csv("contract.csv", $res->{list});

sub write_csv {
	my ($name, $data) = @_;

	open my $csv, ">:encoding(utf8)", $name or die $!;
	for my $code (@$data) {
		printf $csv qq{"%s","%s","%s","%s","%s","%s"\n}, @{$code}{qw/contract status expired customer group permission/};
	}
	close $csv;
}

__END__