#!/usr/bin/perl -w # 2003.11.09 msittig@freeshell.org # 2003.11.12 msittig@freeshell.org (tuning for speed) # 2003.11.13 msittig@freeshell.org (documentation) # 2003.11.14 msittig@freeshell.org (wubi tweaks) # 2003.11.16 msittig@freeshell.org (cosmetic tweaks) # 2003.11.19 msittig@freeshell.org ('markup' bug w/ Lingua module) # 2003.11.20 msittig@freeshell.org (commenting & clarifying) # 2004.01.24 msittig@freeshell.org (ruby, print format & dev_ed switch) # 2004.04.29 msittig@freeshell.org (sidebar param mods for moz sidebar) # 2004.05.10 msittig@freeshell.org (store dict in db for quicker access) # 2005.02.24 msittig@freeshell.org (new jianti/fanti CEDICT format) # 2005.04.10 msittig (fixed regexp backref miscount bug, rebuilt db) use strict; use utf8; use lib qw{/home/msittig/local/lib /home/msittig/local/lib/perl5/site_perl} ; use Lingua::ZH::CEDICT; use Benchmark; use CGI::Pretty qw(:standard :html3); use CGI::Carp qw(fatalsToBrowser); use Data::Dumper::Simple ; use DB_File; my $DEVELOPER_ED = 0 ; my $MAX_WORD_SIZE = param('max_word_size'); $MAX_WORD_SIZE ||= 10; my $DEBUG = param('debug'); $DEBUG ||= 0 ; my $DATABASE = "database"; my $SIDEBAR = ( defined param('sidebar') ) ? 1 : 0; my $ROMAN_CHARACTERS = '0-9a-zA-Z\'"!?~\n\r\t()., /\-=_;{}+*&\[\]·%'; my $SAFARI = param('safari'); $SAFARI ||= 0; my $RUBY = param('ruby'); $RUBY ||=0; my $PRINT_FORMAT = param('print'); $PRINT_FORMAT ||=0; # UTF-8 encoding reg-exp, for parsing text # http://examples.oreilly.com/cjkvinfo/perl/svpm99.pdf my $utf8 = q{ [\x00-\x7F] | [\xC2-\xDF][\x80-\xBF] | \xE0[\xA0-\xBF][\x80-\xBF] | [\xE1-\xEF][\x80-\xBF][\x80-\xBF] | \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF] | [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] }; my @TOOLTIP_SCRIPTS = ""; # Safari doesn't pop-up tooltips, so we include some Javascript if ($SAFARI) { my %script_layer = ( '-type' => 'text/javascript', '-src' => 'layer.js' ); my %script_mouse = ( -type=>'text/javascript', -src=>'mouse.js' ); my %script_tooltip = ( -type=>'text/javascript', -src=>'tooltip.js' ); @TOOLTIP_SCRIPTS = [\%script_layer, \%script_mouse, \%script_tooltip]; }; my $side_links = [ a({-href=>'../src/zhtool/'}, 'Source code'), a({-href=>'http://www.mandarintools.com/cedict.html'}, 'CEDICT'), a({-href=>'http://www.perl.org'}, 'Perl'), a({-href=>'http://popjisyo.com/'}, 'PopJisyo'), a({-href=>'http://lfw.org/shodouka/'}, 'Shodouka'), a({-href=>'http://www.cnblog.org/blog/'}, 'CNBlog'), a({-href=>'http://wubi.org/'}, 'Wubi.org'), a({-href=>q{javascript:if (window.sidebar.addPanel) { window.sidebar.addPanel ('Chinese Reading Tool' , 'http://wubi.org/zhtool/?sidebar=1', ''); }} }, "Add this page to
Mozilla's Sidebar"), ]; my $alt_style = ""; $alt_style .= "span { letter-spacing: 5px; line-height: 3em; }" if $RUBY; $alt_style .= "span { border: 0; }" if $PRINT_FORMAT; $alt_style .= qq( .note, div.checkboxes, .sample { display: none; } textarea { height: 100px; position: relative; left: 15px; } h1 { background-image: none; } input[type="submit"] { position: absolute; top: 80px; left: 160px; } h3 { margin: 0 1em; } ) if $SIDEBAR; my $hidden_inputs = hidden( -name=>'sidebar', -default=>'1' ) if $SIDEBAR; $hidden_inputs ||= ''; # Predeclare these -- they should be global my %dict; my %should_ignore; # This code is for the fancy pinyin accents. my $dict = Lingua::ZH::CEDICT->new(); $dict->init(); # Start printing html code print header(-charset=>'utf-8'); use bytes; # Work around harmless but annoying "wide character" error. print start_html(-title=>'Chinese Tool', -encoding=>'utf-8', -script=>@TOOLTIP_SCRIPTS, -head=>[ meta({-http_equiv=>'Content-Type', -content=>'text/html; charset=utf-8'}), Link({-rel=>'shortcut icon', -href=>'/favicon.png'}) ], -style=>{-src=>'zhtool.css', -media=>'all', -code=>$alt_style}, ); print h1('中文 Tool'), start_form("POST", "./index.cgi", "utf-8"), ol( { -class=>'note' }, li( $side_links ) ), p("Enter Chinese text:"), textarea(-name=>'zhtext', -rows=>'10', -columns=>'50'), "
", submit, div( { -class=>'checkboxes' }, checkbox_group(-name=>'safari', -values=>'Javascript pop-ups', -title=>'Users of the Safari browser should check this box'), checkbox_group(-name=>'ruby', -values=>'Ruby', -title=>'Display pinyin above the characters.'), checkbox_group(-name=>'print', -values=>'Print', -title=>'Format for printing.'), $hidden_inputs, ) unless( $PRINT_FORMAT ); print checkbox_group(-name=>'debug', -values=>'Debug', -title=>'Print debugging info'), radio_group(-name=>'max_word_size', -values=>['4', '5'], -default=>'4') if $DEVELOPER_ED; print end_form unless( $PRINT_FORMAT ); no bytes; # Main parsing routine, executed when there is input if (param('zhtext')) { # For benchmarking purposes my $t0 = new Benchmark; my $time_string = localtime; error("

$time_string

"); # Read the dictionary files into a hash load_dictionary( \%dict, $DATABASE, "cedict.utf8", "msdict.utf8", "override.utf8", ); # Read the list of ignored characters (mostly punctuation) load_dictionary( \%should_ignore, "ignoredb", "ignore.utf8" ); error("
"); # Clean up the input my $text_query = param('zhtext'); $text_query =~ s/\s*$//; # Start recursive lookup of last ("next") word my @text_arrayified = split_utf8('', $text_query); my ($word, $text_left) = next_word($MAX_WORD_SIZE, \@text_arrayified); my $parsed_text = markup($word); while(scalar @$text_left) { error("Adding $word to parsed_text.

"); ($word, $text_left) = next_word($MAX_WORD_SIZE, $text_left); $parsed_text = markup($word).$parsed_text; } error("

"); # Print output $parsed_text =~ s{(\n|\r\n)}{
}g; print h3("Result:") unless( $PRINT_FORMAT ); print p( { -class=>'result' }, "$parsed_text"); # Benchmark output. my $t1 = new Benchmark; my $td = timediff($t1, $t0); print p({-class=>'benchmark'},"This took ",timestr($td),".") unless( $PRINT_FORMAT ); } else { use bytes; print p( { -class=>'sample' }, "Try this: 毛主席万岁 and hover the mouse over the output."); no bytes; } print end_html; exit 1; ## Subroutines # load_dictionary: loads a file into memory for global use # TAKES: reference(hash) - in which to store contents of dic, # string - name of database file # array - names of dictionary files belonging in db # RETURNS: sub load_dictionary { my $hash_ref = shift; my $db_file = shift; my @dictionary_name = @_; my $last_rebuild_date = (stat $db_file)[9]; my( $rebuild ) = ( 0 ); foreach( @dictionary_name ) { error( "Checking age of $_
" ); ( my $age = (stat $_)[9] ) or die "couldn't get $_ age: $!"; $rebuild = 1 if $age > $last_rebuild_date; } rename( $db_file, $db_file.'.o' ) if $rebuild; tie( %$hash_ref, 'DB_File', $db_file ); if( $rebuild ) { error( "Rebuilding $db_file!
" ); my $count = 0 ; foreach my $file ( @dictionary_name ) { my $first_line = ; # one line of dictionary header metadata open(DIC, "< $file") or die( "Couldn't open $file" ) ; binmode( DIC ) ; error("Opened dictionary $file for reading into db.
Header is $first_line
"); while( ) { # regexp from (but modified) # http://germain.umemat.maine.edu/faculty/hiebeler/cedictscripts/cedictlookup m@^(.+?) ((.+?) )?(\[.+\])\s*(/.*/)\s*$@ ; my $simplified = $3 ; $simplified ||= $1 ; my $info = "$4 $5" ; chomp( $simplified, $info ) ; error( "Adding $simplified to the database. ($info)
" ) if $count++ < 3 ; $$hash_ref{ $simplified } = $info || 1 ; } close( DIC ) ; } } } # its_a_word: checks if the chunk is a word in the dictionary # TAKES: string with the word to check # RETURNS: string, 1 or 0 sub its_a_word { error( "Checking if $_[0] is a word..." ) ; return 1 if $dict{"$_[0]"}; return 0; } # next_word: # TAKES: scalar number of characters to try as next word # reference to hash of the remaining text # RETURNS: string with the next word # reference to hash of the remaining text, minus word # recursion in the hiz-ouse sub next_word { my $max_chars = shift; my $text_toparse = shift; my $chars_left = scalar @$text_toparse; error("Find a word in the last $max_chars characters of @$text_toparse ($chars_left)
"); # sanity check: if( $max_chars > $chars_left && $chars_left > 0 ) { $max_chars = $chars_left; error("Only $max_chars left, so – "); } my $is_this_a_word = ""; error("Pulling out $max_chars chars"); foreach my $char (1 .. $max_chars) { if( ! defined $$text_toparse[length(@$text_toparse) - 1] ) { $max_chars -= 1; my $this_many = $max_chars - 1; error("... max_chars $this_many next time."); last; } $is_this_a_word = pop(@$text_toparse)."$is_this_a_word" if( $$text_toparse[(scalar @$text_toparse) - 1] ne ""); error("... $is_this_a_word"); } error("
"); my $confirm_word = its_a_word($is_this_a_word) ; if( $confirm_word || $max_chars == 1) { error("yes!

") if $confirm_word ; error( "nope, but there's nowhere to go from here.

" ) if ! $confirm_word ; return $is_this_a_word, $text_toparse; } else { error("nope.
"); push @$text_toparse, split_utf8('', $is_this_a_word); return next_word($max_chars - 1, $text_toparse); } } # word_so_far sub word_so_far { if ($_[0] ne "") { return $_[0]; } else { return ""; } } # split_utf8: splits a utf-8 string and returns array # TAKES: a dummy variable, so I can switch to normal split # by just search/replacing split_utf8 # string to split, in mixed ascii/utf-8 # RETURNS: array of single characters # perl 5.8's split function doesn't seem to handle utf-8? sub split_utf8 { my $make_linux_happy = shift; error("Adding this back to text to be parsed: $_[0]

"); my @split = $_[0] =~ /$utf8/gox; return @split; } # markup: adds HTML markup to words with pinyin/definition # TAKES: string containing a single word # RETURNS: string containing the word, marked up in html sub markup { my $new_word = shift; my $tag_open = qq{}; my $tag_close = qq{}; my $tag_inner = $new_word; if ($should_ignore{$new_word}) { #in dict of ignored punctuation error("Didn't markup because I'm supposed to ignore: $new_word. "); return $new_word; } elsif ($new_word =~ /[\x00-\x7F]/) { #single-bit ascii (western) error("Didn't markup because it's not multi-byte: $new_word. "); return $new_word; } else { # Now the real work, actually marking something up: my $tooltip_message = $dict->utf8Pinyin($dict{$new_word}) if $dict{$new_word}; $tooltip_message ||= "Not found."; if( $RUBY ) { $tooltip_message =~ s/^.*\[//g; $tooltip_message =~ s/\](.*)$//g; my $pinyin = $tooltip_message; # gets the pinyin $tooltip_message = $1; # gets the definition $tooltip_message =~ s#(^ *\/)|(\/$)##g; my @syllable = split( / /, $pinyin ); my $ii = 0; $tag_inner = ""; # clear this foreach( split_utf8( '', $new_word ) ) { $tag_inner .= "".$_."".$syllable[$ii++].""; } # This next line puts all the pinyin together. # Uncomment it to see a different ruby strategy. # $tag_inner = "$new_word$pinyin"; } # This is default - Safari may change it. $tag_attribute = " title=\"$tooltip_message\""; if( $SAFARI ) { $tooltip_message =~ s/([\'\"])/\\$1/g; # preempt Javascript breakage $tag_attribute = qq{ onmouseover=\"showTooltip(true, \'$tooltip_message\', this)\" onmouseout=\"showTooltip(false)\"}; } return "$tag_open$tag_attribute$tag_open_2$tag_inner$tag_close"; } die "Shouldn't have reached this point."; } # sub error prints error messages if DEBUG flag is not zero sub error { print "$_[0]" if $DEBUG; }