#!/usr/bin/perl -w
# 2003.11.09 msittig@freeshell.org
# 2003.11.12 msittig@freeshell.org (tuning for speed)
# 2003.11.13 msittig@freeshell.org (documentation)
# 2003.11.14 msittig@freeshell.org (wubi tweaks)
# 2003.11.16 msittig@freeshell.org (cosmetic tweaks)
# 2003.11.19 msittig@freeshell.org ('markup' bug w/ Lingua module)
# 2003.11.20 msittig@freeshell.org (commenting & clarifying)
# 2004.01.24 msittig@freeshell.org (ruby, print format & dev_ed switch)
# 2004.04.29 msittig@freeshell.org (sidebar param mods for moz sidebar)
# 2004.05.10 msittig@freeshell.org (store dict in db for quicker access)
# 2005.02.24 msittig@freeshell.org (new jianti/fanti CEDICT format)
# 2005.04.10 msittig (fixed regexp backref miscount bug, rebuilt db)
use strict;
use utf8;
use lib qw{/home/msittig/local/lib /home/msittig/local/lib/perl5/site_perl} ;
use Lingua::ZH::CEDICT;
use Benchmark;
use CGI::Pretty qw(:standard :html3);
use CGI::Carp qw(fatalsToBrowser);
use Data::Dumper::Simple ;
use DB_File;
my $DEVELOPER_ED = 0 ;
my $MAX_WORD_SIZE = param('max_word_size'); $MAX_WORD_SIZE ||= 10;
my $DEBUG = param('debug'); $DEBUG ||= 0 ;
my $DATABASE = "database";
my $SIDEBAR = ( defined param('sidebar') ) ? 1 : 0;
my $ROMAN_CHARACTERS = '0-9a-zA-Z\'"!?~\n\r\t()., /\-=_;{}+*&\[\]·%';
my $SAFARI = param('safari'); $SAFARI ||= 0;
my $RUBY = param('ruby'); $RUBY ||=0;
my $PRINT_FORMAT = param('print'); $PRINT_FORMAT ||=0;
# UTF-8 encoding reg-exp, for parsing text
# http://examples.oreilly.com/cjkvinfo/perl/svpm99.pdf
my $utf8 = q{
[\x00-\x7F]
| [\xC2-\xDF][\x80-\xBF]
| \xE0[\xA0-\xBF][\x80-\xBF]
| [\xE1-\xEF][\x80-\xBF][\x80-\xBF]
| \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
};
my @TOOLTIP_SCRIPTS = "";
# Safari doesn't pop-up tooltips, so we include some Javascript
if ($SAFARI) {
my %script_layer = (
'-type' => 'text/javascript',
'-src' => 'layer.js'
);
my %script_mouse = (
-type=>'text/javascript',
-src=>'mouse.js'
);
my %script_tooltip = (
-type=>'text/javascript',
-src=>'tooltip.js'
);
@TOOLTIP_SCRIPTS = [\%script_layer, \%script_mouse, \%script_tooltip];
};
my $side_links = [
a({-href=>'../src/zhtool/'}, 'Source code'),
a({-href=>'http://www.mandarintools.com/cedict.html'}, 'CEDICT'),
a({-href=>'http://www.perl.org'}, 'Perl'),
a({-href=>'http://popjisyo.com/'}, 'PopJisyo'),
a({-href=>'http://lfw.org/shodouka/'}, 'Shodouka'),
a({-href=>'http://www.cnblog.org/blog/'}, 'CNBlog'),
a({-href=>'http://wubi.org/'}, 'Wubi.org'),
a({-href=>q{javascript:if (window.sidebar.addPanel) { window.sidebar.addPanel ('Chinese Reading Tool' , 'http://wubi.org/zhtool/?sidebar=1', ''); }} }, "Add this page to
Mozilla's Sidebar"),
];
my $alt_style = "";
$alt_style .= "span { letter-spacing: 5px; line-height: 3em; }" if $RUBY;
$alt_style .= "span { border: 0; }" if $PRINT_FORMAT;
$alt_style .= qq(
.note, div.checkboxes, .sample { display: none; }
textarea { height: 100px; position: relative; left: 15px; }
h1 { background-image: none; }
input[type="submit"] { position: absolute; top: 80px; left: 160px; }
h3 { margin: 0 1em; }
) if $SIDEBAR;
my $hidden_inputs = hidden( -name=>'sidebar',
-default=>'1' ) if $SIDEBAR; $hidden_inputs ||= '';
# Predeclare these -- they should be global
my %dict;
my %should_ignore;
# This code is for the fancy pinyin accents.
my $dict = Lingua::ZH::CEDICT->new();
$dict->init();
# Start printing html code
print header(-charset=>'utf-8');
use bytes; # Work around harmless but annoying "wide character" error.
print start_html(-title=>'Chinese Tool',
-encoding=>'utf-8',
-script=>@TOOLTIP_SCRIPTS,
-head=>[ meta({-http_equiv=>'Content-Type',
-content=>'text/html; charset=utf-8'}),
Link({-rel=>'shortcut icon',
-href=>'/favicon.png'})
],
-style=>{-src=>'zhtool.css',
-media=>'all',
-code=>$alt_style},
);
print h1('中文 Tool'),
start_form("POST", "./index.cgi", "utf-8"),
ol( { -class=>'note' }, li( $side_links ) ),
p("Enter Chinese text:"),
textarea(-name=>'zhtext',
-rows=>'10',
-columns=>'50'),
"
",
submit,
div( { -class=>'checkboxes' },
checkbox_group(-name=>'safari',
-values=>'Javascript pop-ups',
-title=>'Users of the Safari browser should check this box'),
checkbox_group(-name=>'ruby',
-values=>'Ruby',
-title=>'Display pinyin above the characters.'),
checkbox_group(-name=>'print',
-values=>'Print',
-title=>'Format for printing.'),
$hidden_inputs,
) unless( $PRINT_FORMAT );
print checkbox_group(-name=>'debug',
-values=>'Debug',
-title=>'Print debugging info'),
radio_group(-name=>'max_word_size',
-values=>['4', '5'],
-default=>'4') if $DEVELOPER_ED;
print end_form unless( $PRINT_FORMAT );
no bytes;
# Main parsing routine, executed when there is input
if (param('zhtext')) {
# For benchmarking purposes
my $t0 = new Benchmark;
my $time_string = localtime;
error("
$time_string
");
# Read the dictionary files into a hash
load_dictionary( \%dict, $DATABASE,
"cedict.utf8",
"msdict.utf8",
"override.utf8",
);
# Read the list of ignored characters (mostly punctuation)
load_dictionary( \%should_ignore, "ignoredb",
"ignore.utf8" );
error("
");
# Clean up the input
my $text_query = param('zhtext');
$text_query =~ s/\s*$//;
# Start recursive lookup of last ("next") word
my @text_arrayified = split_utf8('', $text_query);
my ($word, $text_left) = next_word($MAX_WORD_SIZE, \@text_arrayified);
my $parsed_text = markup($word);
while(scalar @$text_left) {
error("Adding $word to parsed_text.
");
($word, $text_left) = next_word($MAX_WORD_SIZE, $text_left);
$parsed_text = markup($word).$parsed_text;
}
error("