#!/usr/bin/perl -w
#
# ローカルに保存したHTMLファイルから頻出単語を調べる
#
# Jcode.pm, 茶筅(ChaSen)が必要
#
# 使用法: $0 [-r] [-l] [-d] [-n #] [-l [-u #]] [-j 文字コード] [-s 検索エンジン] ファイルまたはディレクトリ...
#
# オプション:
# -r: HTMLの代わりに単語統計ファイルを読み込む
# -d: 単語を含む文書の数で計算する
# -n: Nグラム
# -l: 単語統計ファイルを出力する
# -u: 表示する頻度の下限
# -j: 文字コードの指定
# -s: 検索エンジンの指定(が文字列に置換される)
#
# 文字コード指定
# utf8, sjis, euc, jis
#
# 2004-07-29 0.1-beta9 検索エンジンの指定
# 2004-07-23 0.1-beta8 文書の点数に、文書のサイズを加味する
# 2004-07-23 0.1-beta7 文書の点数に、文書のサイズを加味する
# 2004-07-18 0.1-beta6 文字コード変換の修正
# 2004-07-18 0.1-beta5 自分の興味に近い文書を表示する機能
# 2004-07-11 0.1-beta4 バイグラム
# 2004-07-11 0.1-beta3 単語を含む文書の数で計算する
# 2004-07-11 0.1-beta2 再帰オプションなど
# 2004-07-10 0.1-beta1
#
# Copyright (c) 2004 Satoshi Fukutomi .
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
use strict;
use IPC::Open2;
use Jcode;
use vars qw($JCODE $SEARCH $NGRAM $LIMIT_STRING);
$JCODE = "";
$NGRAM = 1;
$LIMIT_STRING = 7*1024;
$SEARCH = "http://www.google.co.jp/search?q=&ie=EUC-JP&oe=EUC-JP&hl=ja&btnG=Google+%8C%9F%8D%F5&lr=lang_ja&num=100";
my $TYPE = "名詞|未知語";
my $IGNORE = '^.$|^..$|^&|[=<>]|' . "^[\0-\100\133-\140\176-\177]*\$";
my $IGNORE_TYPE = "代名詞|非自立|接尾";
my $UNDER = 1;
my $DOCS_MODE = 0;
my $READ_LIST = 0;
my $WRITE_LIST = 0;
my @file = ();
while (@ARGV) {
$_ = shift @ARGV;
if ($_ eq "-u") {
$UNDER = shift @ARGV;
} elsif ($_ eq "-r") {
$READ_LIST = 1;
} elsif ($_ eq "-l") {
$WRITE_LIST = 1;
} elsif ($_ eq "-d") {
$DOCS_MODE = 1;
} elsif ($_ eq "-n") {
$NGRAM = shift @ARGV;
} elsif ($_ eq "-j") {
$JCODE = shift @ARGV;
} elsif ($_ eq "-s") {
$SEARCH = shift @ARGV;
} else {
push @file, $_;
}
}
die "usage: $0 [-r] [-d] [-n #] [-l [-u #]] [-j code] file...\n" unless (@file);
use vars qw($CHASEN_IN $CHASEN_OUT);
open2($CHASEN_IN, $CHASEN_OUT, "chasen");
#
# ファイルのリスト
#
sub files($);
sub files($) {
my($file) = @_;
if (-f $file) {
return $file;
} elsif (-d $file) {
local $_;
my @file = ();
foreach (glob "$file/*") {
push @file, files($_);
}
return @file;
}
}
#
# HTMLをテキストに変換(タグは飛ばす)
#
sub html2text($@) {
my($xjcode, @buf) = @_;
my $jcode = "";
local $_;
my @buf2 = ();
foreach (@buf) {
if (($jcode eq "") && $_ && (/charset=([^"'\s]+)/i)) {
$jcode = $1;
}
s/<[^>]*>/ /g;
s/\s+/ /g;
push @buf2, $_;
}
$_ = join "\n", @buf2;
if ($jcode =~ /UTF-?8/i) {
$_ = Jcode->new($_, "utf8")->euc;
} elsif ($jcode =~ /SHIFT_JIS/i) {
$_ = Jcode->new($_, "sjis")->euc;
} elsif ($jcode =~ /EUC-JP/i) {
# $_ = Jcode->new($_, "euc")->euc;
} elsif ($jcode =~ /ISO-2022-JP/i) {
$_ = Jcode->new($_, "jis")->euc;
} elsif ($xjcode) {
$_ = Jcode->new($_, $xjcode)->euc;
} else {
$_ = Jcode->new($_)->euc;
}
return $_;
}
#
# ファイルを読む(タグは飛ばす)
#
sub readHTML($) {
my($file) = @_;
my($IN);
local $_;
my $jcode = "";
open $IN, $file or return ();
my @buf = <$IN>;
close $IN;
return html2text($JCODE, @buf);
}
#
# 単語頻度
#
sub freq(@) {
my @file = @_;
my %freq = ();
foreach (@file) {
warn "reading $_\n";
my %freq_file = ();
local $_ = readHTML($_);
my @buf = split /\s+/, $_;
my @prev = ();
foreach (@buf) {
$_ = substr $_, 0, $LIMIT_STRING;
print $CHASEN_OUT $_, "\n";
do {
$_ = <$CHASEN_IN>;
chomp;
@_ = split;
if ((defined $_[2]) && ($_[2] =~ /$IGNORE/)) {
# nop
} elsif ($_[3] && ($_[3] =~ /$IGNORE_TYPE/)) {
# nop
} elsif ($_[3] && ($_[3] =~ /$TYPE/)) {
my $key = $_[2];
my $tmp = $key;
$key = "@prev $key";
push @prev, $tmp;
shift @prev while (@prev >= $NGRAM);
if ($DOCS_MODE && (! $freq_file{$key})) {
$freq_file{$key}++;
$freq{$key}++;
} elsif (! $DOCS_MODE) {
$freq{$key}++;
}
}
} while ($_ && ($_ ne "EOS"));
}
}
return %freq;
}
#
# リストの読み込み
#
sub readList(@) {
my @file = @_;
local $_;
my %freq = ();
my($IN);
foreach (@file) {
open $IN, $_ or next;
while (<$IN>) {
chomp;
@_ = split /: /, $_, 2;
next if (@_ < 2);
$_[0] =~ s/\s//g;
$freq{$_[1]} = $_[0];
}
}
return %freq;
}
#
# wget
#
sub wget($) {
my($uri) = @_;
my($IN, $OUT);
open2($IN, $OUT, "wget", "-i", "-", "-O", "-",
"-U", "Mozilla/4.0 (Webtf)", "-T", 10, "-t", 2);
print $OUT $uri, "\n";
close $OUT;
my @buf = <$IN>;
close $IN;
wait;
return @buf;
}
#
# 文書の点数付け
#
sub score($%) {
my($uri, %freq) = @_;
my @buf = wget($uri);
local $_ = html2text("", @buf);
@buf = split /\s+/, $_;
my $score = 0;
my @prev = ();
my %freq_file = ();
my $size = 0;
foreach (@buf) {
$_ = substr $_, 0, $LIMIT_STRING;
$size += length $_;
print $CHASEN_OUT $_, "\n";
do {
$_ = <$CHASEN_IN>;
chomp;
@_ = split;
if ((defined $_[2]) && ($_[2] =~ /$IGNORE/)) {
# nop
} elsif ($_[3] && ($_[3] =~ /$IGNORE_TYPE/)) {
# nop
} elsif ($_[3] && ($_[3] =~ /$TYPE/)) {
my $key = $_[2];
my $tmp = $key;
$key = "@prev $key";
push @prev, $tmp;
shift @prev while (@prev >= $NGRAM);
if ($freq{$key} && $DOCS_MODE && (! $freq_file{$key})) {
$freq_file{$key}++;
$score += $freq{$key};
} elsif ($freq{$key} && (! $DOCS_MODE)) {
$score += $freq{$key};
}
}
} while ($_ && ($_ ne "EOS"));
}
return $score / sqrt($size+1);
}
#
# URLエンコード
#
sub encode($) {
my($s) = @_;
$s =~ s|[^\w]|'%' . uc(unpack('H2', $&))|eg;
return $s;
}
#
# 検索
#
sub search($) {
my($key) = @_;
$key =~ s/^\s+//;
$key = encode($key);
my $uri = $SEARCH;
$uri =~ s//$key/g;
my @buf = wget($uri);
my @uri = ();
local $_;
foreach (@buf) {
while (/href=["']?([^"'>]+)/) {
push @uri, $1;
$_ = $';
}
}
@uri = grep $_ =~ /^http/, @uri;
@uri = grep $_ !~ /google|rss|rdf|pdf/, @uri;
return @uri;
}
#
# メイン
#
my @file2 = ();
foreach (@file) {
push @file2, files($_);
}
@file = @file2;
my %freq;
if ($READ_LIST) {
%freq = readList(@file);
} else {
%freq = freq(@file);
}
my @key = keys %freq;
@key = sort {$freq{$b} <=> $freq{$a}} @key;
if ($WRITE_LIST) {
foreach (@key) {
printf "%7d: %s\n", $freq{$_}, $_ if ($freq{$_} >= $UNDER);
}
exit;
} elsif (@key == 0) {
exit;
}
my @uri = search($key[0]);
my %score = ();
foreach (@uri) {
$score{$_} = score($_, %freq);
}
@uri = sort {$score{$b} <=> $score{$a}} @uri;
close $CHASEN_IN;
close $CHASEN_OUT;
wait;
print "webtf\n";
foreach (@uri) {
printf "- %7d: %s
\n", $score{$_}, $_, $_;
}
print "
\n";