webtf

#!/usr/bin/perl -w # # ローカルに保存したHTMLファイルから頻出単語を調べる # # Jcode.pm, 茶筅(ChaSen)が必要 # # 使用法: $0 [-r] [-l] [-d] [-n #] [-l [-u #]] [-j 文字コード] [-s 検索エンジン] ファイルまたはディレクトリ... # # オプション: # -r: HTMLの代わりに単語統計ファイルを読み込む # -d: 単語を含む文書の数で計算する # -n: Nグラム # -l: 単語統計ファイルを出力する # -u: 表示する頻度の下限 # -j: 文字コードの指定 # -s: 検索エンジンの指定(が文字列に置換される) # # 文字コード指定 # utf8, sjis, euc, jis # # 2004-07-29 0.1-beta9 検索エンジンの指定 # 2004-07-23 0.1-beta8 文書の点数に、文書のサイズを加味する # 2004-07-23 0.1-beta7 文書の点数に、文書のサイズを加味する # 2004-07-18 0.1-beta6 文字コード変換の修正 # 2004-07-18 0.1-beta5 自分の興味に近い文書を表示する機能 # 2004-07-11 0.1-beta4 バイグラム # 2004-07-11 0.1-beta3 単語を含む文書の数で計算する # 2004-07-11 0.1-beta2 再帰オプションなど # 2004-07-10 0.1-beta1 # # Copyright (c) 2004 Satoshi Fukutomi . # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # use strict; use IPC::Open2; use Jcode; use vars qw($JCODE $SEARCH $NGRAM $LIMIT_STRING); $JCODE = ""; $NGRAM = 1; $LIMIT_STRING = 7*1024; $SEARCH = "http://www.google.co.jp/search?q=&ie=EUC-JP&oe=EUC-JP&hl=ja&btnG=Google+%8C%9F%8D%F5&lr=lang_ja&num=100"; my $TYPE = "名詞|未知語"; my $IGNORE = '^.$|^..$|^&|[=<>]|' . "^[\0-\100\133-\140\176-\177]*\$"; my $IGNORE_TYPE = "代名詞|非自立|接尾"; my $UNDER = 1; my $DOCS_MODE = 0; my $READ_LIST = 0; my $WRITE_LIST = 0; my @file = (); while (@ARGV) { $_ = shift @ARGV; if ($_ eq "-u") { $UNDER = shift @ARGV; } elsif ($_ eq "-r") { $READ_LIST = 1; } elsif ($_ eq "-l") { $WRITE_LIST = 1; } elsif ($_ eq "-d") { $DOCS_MODE = 1; } elsif ($_ eq "-n") { $NGRAM = shift @ARGV; } elsif ($_ eq "-j") { $JCODE = shift @ARGV; } elsif ($_ eq "-s") { $SEARCH = shift @ARGV; } else { push @file, $_; } } die "usage: $0 [-r] [-d] [-n #] [-l [-u #]] [-j code] file...\n" unless (@file); use vars qw($CHASEN_IN $CHASEN_OUT); open2($CHASEN_IN, $CHASEN_OUT, "chasen"); # # ファイルのリスト # sub files($); sub files($) { my($file) = @_; if (-f $file) { return $file; } elsif (-d $file) { local $_; my @file = (); foreach (glob "$file/*") { push @file, files($_); } return @file; } } # # HTMLをテキストに変換(タグは飛ばす) # sub html2text($@) { my($xjcode, @buf) = @_; my $jcode = ""; local $_; my @buf2 = (); foreach (@buf) { if (($jcode eq "") && $_ && (/charset=([^"'\s]+)/i)) { $jcode = $1; } s/<[^>]*>/ /g; s/\s+/ /g; push @buf2, $_; } $_ = join "\n", @buf2; if ($jcode =~ /UTF-?8/i) { $_ = Jcode->new($_, "utf8")->euc; } elsif ($jcode =~ /SHIFT_JIS/i) { $_ = Jcode->new($_, "sjis")->euc; } elsif ($jcode =~ /EUC-JP/i) { # $_ = Jcode->new($_, "euc")->euc; } elsif ($jcode =~ /ISO-2022-JP/i) { $_ = Jcode->new($_, "jis")->euc; } elsif ($xjcode) { $_ = Jcode->new($_, $xjcode)->euc; } else { $_ = Jcode->new($_)->euc; } return $_; } # # ファイルを読む(タグは飛ばす) # sub readHTML($) { my($file) = @_; my($IN); local $_; my $jcode = ""; open $IN, $file or return (); my @buf = <$IN>; close $IN; return html2text($JCODE, @buf); } # # 単語頻度 # sub freq(@) { my @file = @_; my %freq = (); foreach (@file) { warn "reading $_\n"; my %freq_file = (); local $_ = readHTML($_); my @buf = split /\s+/, $_; my @prev = (); foreach (@buf) { $_ = substr $_, 0, $LIMIT_STRING; print $CHASEN_OUT $_, "\n"; do { $_ = <$CHASEN_IN>; chomp; @_ = split; if ((defined $_[2]) && ($_[2] =~ /$IGNORE/)) { # nop } elsif ($_[3] && ($_[3] =~ /$IGNORE_TYPE/)) { # nop } elsif ($_[3] && ($_[3] =~ /$TYPE/)) { my $key = $_[2]; my $tmp = $key; $key = "@prev $key"; push @prev, $tmp; shift @prev while (@prev >= $NGRAM); if ($DOCS_MODE && (! $freq_file{$key})) { $freq_file{$key}++; $freq{$key}++; } elsif (! $DOCS_MODE) { $freq{$key}++; } } } while ($_ && ($_ ne "EOS")); } } return %freq; } # # リストの読み込み # sub readList(@) { my @file = @_; local $_; my %freq = (); my($IN); foreach (@file) { open $IN, $_ or next; while (<$IN>) { chomp; @_ = split /: /, $_, 2; next if (@_ < 2); $_[0] =~ s/\s//g; $freq{$_[1]} = $_[0]; } } return %freq; } # # wget # sub wget($) { my($uri) = @_; my($IN, $OUT); open2($IN, $OUT, "wget", "-i", "-", "-O", "-", "-U", "Mozilla/4.0 (Webtf)", "-T", 10, "-t", 2); print $OUT $uri, "\n"; close $OUT; my @buf = <$IN>; close $IN; wait; return @buf; } # # 文書の点数付け # sub score($%) { my($uri, %freq) = @_; my @buf = wget($uri); local $_ = html2text("", @buf); @buf = split /\s+/, $_; my $score = 0; my @prev = (); my %freq_file = (); my $size = 0; foreach (@buf) { $_ = substr $_, 0, $LIMIT_STRING; $size += length $_; print $CHASEN_OUT $_, "\n"; do { $_ = <$CHASEN_IN>; chomp; @_ = split; if ((defined $_[2]) && ($_[2] =~ /$IGNORE/)) { # nop } elsif ($_[3] && ($_[3] =~ /$IGNORE_TYPE/)) { # nop } elsif ($_[3] && ($_[3] =~ /$TYPE/)) { my $key = $_[2]; my $tmp = $key; $key = "@prev $key"; push @prev, $tmp; shift @prev while (@prev >= $NGRAM); if ($freq{$key} && $DOCS_MODE && (! $freq_file{$key})) { $freq_file{$key}++; $score += $freq{$key}; } elsif ($freq{$key} && (! $DOCS_MODE)) { $score += $freq{$key}; } } } while ($_ && ($_ ne "EOS")); } return $score / sqrt($size+1); } # # URLエンコード # sub encode($) { my($s) = @_; $s =~ s|[^\w]|'%' . uc(unpack('H2', $&))|eg; return $s; } # # 検索 # sub search($) { my($key) = @_; $key =~ s/^\s+//; $key = encode($key); my $uri = $SEARCH; $uri =~ s//$key/g; my @buf = wget($uri); my @uri = (); local $_; foreach (@buf) { while (/href=["']?([^"'>]+)/) { push @uri, $1; $_ = $'; } } @uri = grep $_ =~ /^http/, @uri; @uri = grep $_ !~ /google|rss|rdf|pdf/, @uri; return @uri; } # # メイン # my @file2 = (); foreach (@file) { push @file2, files($_); } @file = @file2; my %freq; if ($READ_LIST) { %freq = readList(@file); } else { %freq = freq(@file); } my @key = keys %freq; @key = sort {$freq{$b} <=> $freq{$a}} @key; if ($WRITE_LIST) { foreach (@key) { printf "%7d: %s\n", $freq{$_}, $_ if ($freq{$_} >= $UNDER); } exit; } elsif (@key == 0) { exit; } my @uri = search($key[0]); my %score = (); foreach (@uri) { $score{$_} = score($_, %freq); } @uri = sort {$score{$b} <=> $score{$a}} @uri; close $CHASEN_IN; close $CHASEN_OUT; wait; print "webtf

%7d: %s

\n";