#!/usr/bin/perl
#
# WARNING: This file contains UTF-8 Unicode characters. Please take any
# necessary precautions to avoid corruption when transmitting this file.
#
#    Rōmaji to Kana transliterator
#    Copyright (C) 2007  Daniel A. Ramaley
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#    or visit http://www.gnu.org/copyleft/gpl.txt.
#
#
# Documentation about what this program does is provided in the &about function.

use v5.8.1;              # Make sure Unicode support is decent
use warnings 'all';
use strict;
use charnames ':full';   # Allow English names for Unicode characters
local *vianame = \&charnames::vianame; # Alias this because it is used a lot.
use utf8;                # Needed if the source file contains Unicode
use Encode qw();
use HTML::Entities qw(&encode_entities);
use POSIX qw(&strftime);

# These variables are used by the transliteration process. The first four
# contain static data and are created by &define_vars:
#     @gojūon      - Holds the 50-sounds chart so it can be easily displayed.
#     %kana        - Hash table to hold 仮名 mappings.
#     $kana_regex  - Used to break Latin strings into rōmaji syllables.
#     %preprocess  - Hash table to hold Latin -> Latin mappings that are applied
#                    prior to the rōmaji -> 仮名 conversion.
#     $pre_regex   - Similar function as $kana_regex, but for %preprocess.
#     @yōon        - Holds the yōon chart so it can be easily displayed
# The last variable must be emptied for each run:
#     @quote_stack - Used by &special.
our (@gojūon, %kana, $kana_regex, %preprocess, $pre_regex, @yōon);
our @quote_stack = ();
&define_vars unless %kana;

# These variables are used by the XHTML interface side of the script.
our %FORM = ();          # Form input
our $ENCODING = undef;   # Output encoding
our $SELF;               # URL for this script
our $MODIFIED;           # Date this file was last changed
&define_htmlvars;


if (defined $FORM{'action'}) {
    # Use a dispatch table instead of a messy if/elsif condition.
    my %actions = ('about' => \&about, 'css' => \&css, 'kana' => \&kana,
                   'main' => \&main,   'source' => \&source,
                   'syllables' => \&syllables,
        );
    &error('Bad input given.') if not defined $actions{$FORM{'action'}};
    &{$actions{$FORM{'action'}}};
} else {
    &main;
}
exit;


################################################################################
################################################################################
####            Functions to perform ローマ字 to 仮名 conversion            ####
################################################################################
################################################################################


################################################################################
# This function builds a 3-dimensional array that contains the 五十音 (gojūon)
# chart in 縦書き (tategaki "vertical writing") ordering. The 3rd dimension is
# used to store all of the rōmaji, ひらがな, and カタカナ in one array.
# This must be called after the standard gojūon have been defined in %kana
# but before the non-standard additions have been added.
sub define_gojūon {
    my @headings = ('p','b','d','z','g','#','w','r','y','m','h','n','t','s','k','');
    my @vowels = ('a', 'i', 'u', 'e', 'o');
    foreach my $row (@vowels) {
        my @thisrow = ();
        foreach my $col (@headings) {
            if ($col eq '#') {
                push @thisrow, ($row eq 'a' ? ['n', 'ん', 'ン'] :
                                [(undef) x 3]);
            } else {
                my $syl = $col . $row;
                push @thisrow, [ $syl, $kana{$syl}, $kana{uc $syl} ];
            }
        }
        push @gojūon, \@thisrow;
    }
}

################################################################################
# Programmatically define the 仮名. Unicode character names are used instead
# of hex codes for easier reading and debugging, at the expense of execution
# speed.
sub define_vars {
    # Variables used in this function. Don't touch.
    my @vowels = ('a', 'i', 'u', 'e', 'o');

    # Standard 仮名
    ###############

    # Small letters
    foreach (@vowels, 'ka', 'ke', 'tu', 'ya', 'yu', 'yo', 'wa') {
        my $uc = uc;
        $kana{"x${_}"}  = sprintf '%c', &vianame("HIRAGANA LETTER SMALL ${uc}");
        $kana{"X${uc}"} = sprintf '%c', &vianame("KATAKANA LETTER SMALL ${uc}");
    }
    # These small カタカナ are only used by the アイヌ語 (Ainu language) but are
    # included here for completeness.
    foreach ('KU', 'SI', 'SU', 'TO', 'NU', 'HA', 'HI', 'HU', 'HE', 'HO', 'MU',
             'RA', 'RI', 'RU', 'RE', 'RO') {
        $kana{"X${_}"} = sprintf '%c', &vianame("KATAKANA LETTER SMALL ${_}");
    }

    # Main 五十音 (gojūon) 仮名. Also 濁点 (dakuten) and 半濁点 (handakuten)
    foreach my $consonant ('','k','s','t','n','h','m','y','r','w', # 五十音
                           'g', 'z', 'd', 'b', 'p') { # 濁点、半濁点
        foreach my $vowel (@vowels) {
            my $cv = $consonant . $vowel;
            my $uc = uc $cv;
            my ($hiragana, $katakana);
            $kana{$cv} = sprintf '%c', $hiragana
                if $hiragana = &vianame("HIRAGANA LETTER ${uc}");
            $kana{$uc} = sprintf '%c', $katakana
                if $katakana = &vianame("KATAKANA LETTER ${uc}");
        }
    }
    # ん is hard to define with the above loop:
    @kana{'n', 'N'} = ('ん', 'ン');

    # This needs to be done here after the main 五十音 have been defined and
    # before any extra definitions have been added (such as non-existent "wu").
    &define_gojūon;
    # This one can be done later, but it is here because it is related to the
    # &define_gojūon call.
    &define_yōon;

    # Another way to access archaic ゐ and ゑ:
    @kana{'wyi', 'WYI', 'wye', 'WYE'} = ('ゐ', 'ヰ', 'ゑ', 'ヱ');
    # Non-standard syllables to complete the chart:
    @kana{'yi','YI','ye','YE','wu','WU'} = ('い','イ','いぇ','イェ','う','ウ');

    # Redefine カタカナ　ゐ (wi) ゑ (we) and ヲ (wo) to reflect modern usage.
    # The last one provides access to ヲ that was redefined. However, it is
    # *very* nonstandard (i've never seen it anywhere else).
    @kana{'WI', 'WE', 'WO', 'WYO'} = ('ウィ', 'ウェ', 'ウォ', 'ヲ');

    # Add 拗音 (Yōon) syllables. Note that "yi" and "ye" forms are seldom used.
    foreach my $consonant ('k','s','t','n','h','m','r','g','z','d','b','p') {
        my $uc = uc $consonant;
        my ($hira, $kata) = @kana{"${consonant}i", "${uc}I"};
        @kana{map { "${consonant}y${_}" } @vowels} =
            map {"${hira}${_}"} ('ゃ', 'ぃ', 'ゅ', 'ぇ', 'ょ');
        @kana{map { "${uc}Y" . uc $_ } @vowels} =
            map {"${kata}${_}"} ('ャ', 'ィ', 'ュ', 'ェ', 'ョ');
    }

    # Alias "l" to be the same as "r"
    foreach my $r (grep /^r/io, keys %kana) {
        (my $l = $r) =~ tr/Rr/Ll/;
        $kana{$l} = $kana{$r};
    }

    # Seldom-used and less standard combinations
    ############################################

    # Note that a few definitions below are actually quite common, "tsu" and
    # "fu" for instance.

    # Combinations that use small vowels
    @kana{'vu', 'VU'} = ('ゔ', 'ヴ'); # Initialize these so the loop will work.
    foreach (['v',  'vu'], ['kh', 'ku'], ['kw', 'ku'], ['q',  'ku'],
             ['sw', 'su'], ['ts', 'tu'], ['tw', 'to'], ['f',  'hu'],
             ['mh', 'mu'], ['mw', 'mu'], ['wh', 'u' ], ['gh', 'gu'],
             ['gw', 'gu'], ['zh', 'zu'], ['zw', 'zu'], ['dw', 'do'],
             ['bh', 'ba'], ['bw', 'bi'], ['ph', 'pu'], ['pw', 'pu']) {
        my ($r, $consonant) = @$_;
        my ($hira, $kata) = @kana{$consonant, uc ($consonant)};
        @kana{map { "${r}${_}" } @vowels} =
            map { "${hira}${_}" } ('ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ');
        @kana{map { uc "${r}${_}" } @vowels} =
            map { "${kata}${_}" } ('ァ', 'ィ', 'ゥ', 'ェ', 'ォ');
    }
    # Exceptions to the above that use ゎ (small wa)
    @kana{'kwa', 'KWA', 'mwa', 'MWA', 'gwa', 'GWA', 'zwa', 'ZWA', 'bwa', 'BWA',
          'pwa', 'PWA'} = ('くゎ', 'クヮ', 'むゎ', 'ムヮ', 'ぐゎ', 'グヮ',
                           'ずゎ', 'ズヮ', 'びゎ', 'ビヮ', 'ぷゎ', 'プヮ');
    # Exceptions that map to just 1 character
    @kana{'tsu', 'TSU', 'fu', 'FU', 'whu', 'WHU', 'vu', 'VU'} =
        ('つ', 'ツ', 'ふ', 'フ', 'う', 'ウ', 'ゔ', 'ヴ');
    # Note that the syllables VA, VI, VE, VO exist in カタカナ, but have been
    # replaced in modern usage with the combinations VU plus a small vowel. If
    # the older V syllables are desired, they have been defined here with the
    # X prefix. Note that this is *very* nonstandard:
    @kana{'XVA', 'XVI', 'XVU', 'XVE', 'XVO'} = ('ヷ', 'ヸ', 'ヴ', 'ヹ', 'ヺ');

    # Combinations that use small ya, etc.
    foreach (['vy', 'vu'], ['sh', 'si'], ['j',  'zi'], ['jy', 'zi'],
             ['th', 'te'], ['ch', 'ti'], ['cy', 'ti'], ['fy', 'hu'],
             ['dh', 'de']) {
        my ($r, $consonant) = @$_;
        my ($hira, $kata) = @kana{$consonant, uc ($consonant)};
        @kana{map { "${r}${_}" } @vowels} =
            map { "${hira}${_}" } ('ゃ', 'ぃ', 'ゅ', 'ぇ', 'ょ');
        @kana{map { uc "${r}${_}" } @vowels} =
            map { "${kata}${_}" } ('ャ', 'ィ', 'ュ', 'ェ', 'ョ');
    }
    # Exceptions that map to just 1 character
    @kana{'shi','SHI','ji','JI','chi','CHI'} = ('し','シ','じ','ジ','ち','チ');

    # Bizarre aliases
    @kana{'ca', 'ci', 'cu', 'ce', 'co'} = @kana{'ka', 'si', 'ku', 'se', 'ko'};
    @kana{'CA', 'CI', 'CU', 'CE', 'CO'} = @kana{'KA', 'SI', 'KU', 'SE', 'KO'};
    @kana{'jhe', 'JHE', 'dzi', 'DZI'} = @kana{'za', 'ZA', 'di', 'DI'};

    # Rare "ŋ" or "ng" sound formed by combining K consonants with 半濁点
    # These are preceded with "x" to avoid collision with the far more common
    # "んご" sequence.
    my $半濁点 = "\N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}";
    @kana{map { "xng${_}" } @vowels} = map { $kana{"k${_}"} . $半濁点 } @vowels;
    @kana{map { 'XNG' . uc($_) } @vowels} =
        map { $kana{'K' . uc ($_)} . $半濁点 } @vowels;

    # Numerals and punctuation
    ##########################

    # Numerals
    @kana{'0','1','2','3','4','5','6','7','8','9'} =
        ('０','１','２','３','４','５','６','７','８','９');

    # Punctuation. Some of these are not traditionally used.
    @kana{' ',',','.','!','?',';',':'} = ('　','、','。','！','？','；','：');
    @kana{'~','#','$','%','&','@','-'} = ('〜','＃','＄','％','＆','＠','ー');
    @kana{'*','+','/','\\','='} = ('＊','＋','／','＼','＝');
    @kana{'^','_','|'} = ('＾','＿','｜');
    @kana{'..','...'} = ('‥','……'); # Traditional Japanese ellipsis is 6 dots

    # Quotation marks
    # Note: Distinguish opening and closing quotes is done with logic in the
    # &special function which is called when a character sequence is not
    # recognized. So do not define quotes here.
    #@kana{'"','"','""','""'} = ('「','」','『','』');

    # Braces.
    # Note that most of these definitions are *very* non-standard and were
    # chosen such that it is possible to type any of the braces, even if they
    # are seldom used. Commonly used braces should work as expected.
    @kana{'(',  ')',   '{', '}',
          '[',  ']',  '[[', ']]',
          '<',  '>',  '<<', '>>',
          '[(', ')]', '[{', '}]',
          '([', '])', '({', '})'} =
        ('（','）','｛','｝', # Parenthesis, Curly Bracket
         '［','］','〚','〛', # Square Bracket, White Square Bracket
         '〈','〉','《','》', # Angle Bracket, Double Angle Bracket
         '【','】','〖','〗', # Black Lenticular Bracket, White Lenticular Bracket
         '〔','〕','〘','〙', # Tortoise Shell Bracket, White Tortoise Shell Bracket
        );

    # Regex definition
    ##################

    # This regular expression is used to split the input string into syllables.
    # Though this regex is quite large, the structure of it is very simple, with
    # the interesting stuff happening in &$regex_sort.
    $kana_regex = join '|', map { quotemeta } sort regex_sort keys %kana;
    $kana_regex = qr/($kana_regex)/o;

    # Preprocess definition
    #######################

    # %preprocess is used in case of circumflex or macron characters in the
    # input. Most rōmaji systems use these characters to represent long vowels.
    @preprocess{'Â','Î','Û','Ê','Ô','Ā','Ī','Ū','Ē','Ō'} =
        ('A-','I-','U-','E-','O-') x 2;
    @preprocess{'â','î','û','ê','ô','ā','ī','ū','ē','ō'} =
        ('aa','ii','uu','ei','ou') x 2;
    @preprocess{'Ŋ', 'ŋ'} = ('XNG', 'xng');
    @preprocess{"N\N{COMBINING MACRON}","n\N{COMBINING MACRON}"} = ("N'","n'");

    $pre_regex = join '|', sort keys %preprocess;
    $pre_regex = qr/($pre_regex)/o;
}

################################################################################
# This function builds a 3-dimensional array that containing the 拗音 (yōon)
# chart in 縦書き (tategaki "vertical writing") ordering. The 3rd dimension is
# used to store all of the rōmaji, ひらがな, and カタカナ in one array.
sub define_yōon {
    foreach my $row ('a', 'u', 'o') {
        my @thisrow = ();
        foreach my $col ('p','b','d','z','g','r','m','h','n','t','s','k') {
            push @thisrow, ["${col}y${row}",
                            $kana{$col.'i'}.$kana{'xy'.$row},
                            $kana{uc $col.'I'}.$kana{'XY'.uc $row}];
        }
        push @yōon, \@thisrow;
    }
}

################################################################################
# Used to sort syllables for use in a regex. The sort order should be such that
# syllables are matched correctly, with a secondary emphasis on performance of
# the regular expression parser.
#
# Please note that it is likely that my understanding of the inner workings of
# the regular expression parser is incomplete or incorrect and there may be a
# more efficient way to order these. I do not claim that this is the *best* way,
# just that it should work and not give abysmal performance.
sub regex_sort {
    # Organize entries loosely by how often entries are likely to be
    # encountered: put lowercase characters (ひらがな) first followed by
    # uppercase (カタカナ) and then symbols.
    ($b =~ /^[a-z]/o) <=> ($a =~ /^[a-z]/o) or
    ($b =~ /^[A-Z]/o) <=> ($a =~ /^[A-Z]/o) or
    # Group entries with the same initial characters together to help the
    # regex parser skip groups of non-matches more quickly
    substr($a, 0, 1) cmp substr($b, 0, 1) or
    # Then reverse sort by length. This is done, for example, so "na" has a
    # chance to match before "n" and "..." will be matched as 1 entry
    # instead of three occurences of "."
    length($b) cmp length($a) or
    # And finally by ASCII value (mostly just to ensure a predictable order)
    $a cmp $b
}

################################################################################
# Handle character sequences that were not found in %kana. This is where Latin
# quotes are converted into angle brackets. Completely unrecognized sequences
# are returned with tags so that they may be displayed differently.
sub special {
    my ($chars) = @_ if @_;
    my %brackets = ('"' => ['「', '」'], '""' => ['『', '』']);

    my @chunks = (map { length($_) ? $_ : () } # Eat empty strings from split
                  split /(\"\"|\")/o, $chars); # Split into chunks of quotes
    my $returnval = '';
    foreach (@chunks) {
        if (exists $brackets{$_}) {
            if (@quote_stack and $quote_stack[-1] eq $_) {
                # Close an open set of brackets
                $returnval .= ${$brackets{$_}}[1]; pop @quote_stack;
            } else {
                # Start a new pair of brackets
                $returnval .= ${$brackets{$_}}[0]; push @quote_stack, $_;
            }
        } else {
            # This chunk was not recognized!
            $returnval .= "<BAD>$_</BAD>";
        }
    }
    return $returnval;
}

################################################################################
# This function's purpose is very simple: take a string of ローマ字 and return
# 仮名. This is done in 3 passes. The first normalizes the input by converting
# double consonants and macrons into their longer forms. Pass 2 identifies what
# parts of the input can be processed and converts recognizable chunks into
# 仮名 while tagging unrecognizable chunks. Pass 3 does some final cleanup on
# the output and may be easily commented out if so desired.
sub transliterate {
    my ($input) = @_ if @_;
    my $output;

    ##########
    # PASS 1 #
    ##########

    # Convert doubled consonants into "xtu". This has to be done before long
    # vowel conversion so that ŋ and Ŋ are handled correctly.
    $input =~ s/([bcdfghjklmnpqrstvwxyzŋ])\1/xtu$1/go;
    $input =~ s/([BCDFGHJKLMNPQRSTVWXYZŊ])\1/XTU$1/go;

    # Convert macron long vowels into explicit doubled vowels.
    $input =~ s/$pre_regex/$preprocess{$1}/ego;

    ##########
    # PASS 2 #
    ##########

    $output = join '', (
        # This line is where rōmaji finally becomes 仮名.
        map { exists $kana{$_} ? $kana{$_} : &special($_) }
        # Eat empty strings added by split
        map { length($_) ? $_ : () }
        # Break the string into chunks of recognized characters
        map { split /$kana_regex/o }
#Consider enabling the next line and then at this position add an "ng$ -> n"
#mapping. possibly also add "s$ -> su" and some other whispered vowels.
        # Break string into chunks of uppercase, lowercase, and other characters
        #map { m/([a-zŋ]+|[A-ZŊ]+|[^a-zŋA-ZŊ]+)/go }
        # Single quote is used to differentiate にゃ (nya) from んや (n'ya)
        split /\'/o, $input
    );

    ##########
    # PASS 3 #
    ##########

    # Add middle dots between カタカナ words.
    1 while ($output =~ s/(\p{inKatakana})\s(\p{inKatakana})/$1・$2/go);

    return $output;
}


################################################################################
################################################################################
####                           Encoding functions                           ####
################################################################################
################################################################################


################################################################################
# Convert to EUC-JP
sub encode_eucjp {
    my $val = $_[0];
    return &Encode::encode('euc-jp', $val, Encode::FB_XMLCREF);
}

################################################################################
# Convert to ISO-2022-JP
sub encode_iso2022jp {
    my $val = $_[0];
    return &Encode::encode('iso-2022-jp-1', $val, Encode::FB_XMLCREF);
}

################################################################################
# Encode non-ASCII Unicode characters with their Unicode character number.
sub encode_iso88591 {
    # Do not use &encode_entities here because it also encodes quote characters
    return join('',
                map { ord > 127 ? sprintf('&#x%x;', ord) : $_ }
                split //, $_[0]);
}

################################################################################
# Convert to Shift_JIS
sub encode_shiftjis {
    my $val = $_[0];
    return &Encode::encode('shiftjis', $val, Encode::FB_XMLCREF);
}

################################################################################
# UTF-8 encoding is trivial since it is Perl's native format
sub encode_utf8 { return $_[0]; }


################################################################################
################################################################################
####                          Interface functions                           ####
################################################################################
################################################################################


################################################################################
# Print the help/about page
sub about {
    (my $out = <<EOF) =~ s/^    //gmo;
    <p>This program accepts input in ローマ字 (rōmaji) and produces the
    equivalent ひらがな (hiragana) and カタカナ (katakana) output. This program
    does <em>not</em> map arbitrary English phonemes into the Japanese sound
    system, nor does it translate from English to Japanese.</p>

    <p>Transliteration is done by use of an associative array (hash table) that
    contains mappings from syllables written in Latin letters to 仮名 (kana).
    ひらがな may be produced by entering lowercase letters, カタカナ by using
    uppercase. Some punctuation is also allowed, though the Romanization for
    the lesser-used brackets is quite non-standard.</p>

    <p>Some attempt has been made to simultaneously support the most common
    Romanization systems (<a href="http://en.wikipedia.org/wiki/Hepburn_romanization">修正ヘボン式ローマ字</a> (Modified Hepburn), <a href="http://en.wikipedia.org/wiki/Kunrei-shiki_R%C5%8Dmaji">訓令式ローマ字</a> (Kunrei-siki), and <a href="http://en.wikipedia.org/wiki/Nihon-shiki_R%C5%8Dmaji">日本式ローマ字</a>
    (Nihon-siki), though completely meeting that goal is impossible. Where
    different systems conflict, precedence has been given to 日本式ローマ字
    since it is the most regular and has a 1-to-1 relation between 仮名 and
    rōmaji.</p>

    <p>Unrecognized characters in the input will be returned unprocessed but
    highlighted in a different color.</p>

    <h1>Quotes</h1>

    <p>Apostrophes (single quotes) are used to differentiate ん (syllabic n)
    from a doubled consonant when it appears before one of な に ぬ ね の. They
    are similarly used to prevent ん from binding with vowels and y syllables.
    For example, 「んな」, 「んあ」, and 「んや」 ("n'na", "n'a", and "n'ya")
    all require an apostrophe, while 「っな」, 「な」, and 「にゃ」 ("nna",
    "na", and "nya") do not.</p>

    <p>An attempt has been made to translate Latin quotes properly such that
    regular double and single quotes as are used in English will be converted
    into left and right corner brackets (「」『』) as appropriate. However,
    there is a peculiarity with the way that white corner brackets (『 and 』)
    are represented:</p>

    <p>Normally single quotes would be used to represent white corner brackets,
    but this is not possible since single quotes have already been used as
    described previously. Instead, white corner brackets are represented by
    doubled double quotes (""). This is admittedly non-standard and
    may change if a better solution is discovered.</p>

    <p>A limitation of the quote translation is that quotes must be balanced.
    That is, if a set of regular corner brackets is opened, it is necessary to
    close it before the next set of the same type of bracket may be opened. For
    example, this ordering of brackets is possible: 「『』『』」「」 (and may
    be generated with this rather improbable sequence: "\'""\'""\'""\'""\'"\'"\'")
    while this one is not: 「『」』.</p>

    <h1><em>X</em> Characters</h1>

    <p>Some characters are difficult to represent in rōmaji. These characters\'
    Romanization has been preceded with an "x". The cases where "x" is used
    are:</p>

    <ul>
        <li>Small versions of characters may be accessed using "x". For example,
        to produce a 促音 (sukuon, or "little tsu") enter "xtu" or "XTU", which
        will output っ or ッ. Of course, double consonants will be replaced by
        the 促音 without having to explicitely specify "xtu" or "XTU". A number
        of small カタカナ that are only used by the Ainu language are supported,
        though they are included solely for completeness as the intention of
        this program is for transliteration of Japanese, not Ainu.</li>

        <li>The カタカナ syllables ヷ (va), ヸ (vi), ヹ (ve), ヺ (vo) have been
        replaced in modern usage with ヴ (vu) plus a small vowel. The older
        single-symbol versions of these syllables may be accessed with "XVA",
        "XVI", etc.</li>

        <li>The very rare "ŋ" or "ng" sounds are produced this way using strings
        such as "xnga". This was done to avoid collision with the far more
        prevalent "nga" sequence that should more commonly be rendered as
        「んが」.</li>
    </ul>

    <h1>Output</h1>

    <p>Several output encodings are available. UTF-8 is the preferred encoding,
    and all modern web browsers should natively support it. ISO-8859-1 encoding
    with XHTML &amp;#codes; is provided as a second option for older browsers
    that do not properly handle Unicode. In addition, the following legacy
    encodings are provided: EUC-JP, Shift_JIS, ISO-2022-JP. Please note that the
    legacy encodings have not been as thoroughly tested.</p>

    <p>All pages except the Source Code page inherit the encoding set on the
    main page. The Source Code page always uses UTF-8 since that is what the
    script is actually encoded in and a different encoding may break it.</p>

    <h1>Meta</h1>

    <p>The idea for this program was borrowed from Joel Yliluoma
    (Bisqwit)'s <a href="http://kanjidict.stc.cx/hiragana.php">Romaji to
    hiragana converter</a>. Though the source for Bisqwit's program is
    generously provided, no actual code was borrowed.</p>

    <p> The main reasons for writing this program were to gain a deeper
    understanding of rōmaji to 仮名 transliteration, to play with Unicode and
    other encoding methods for non-English languages, to work more with 仮名,
    and to start working with 漢字 (kanji) (the latter only appears in the
    comments and documentation for now).</p>

    <p>The previously linked <a href="http://en.wikipedia.org/">Wikipedia</a>
    articles, as well as several others linked from the
    <a href="http://en.wikipedia.org/wiki/Japanese_writing">Japanese Writing</a>
    entry were helpful in researching how transliteration is done. The Kotoeri
    input method used on Mac OS X was also helpful in deciding how to deal with
    various edge cases.</p>
EOF
    &display_xhtml_header('about');
    print '<div class="info">', &encode($out), '</div>';
    &display_xhtml_footer;
}

################################################################################
# Return the CSS document.
sub css {
    (my $out = <<'EOF') =~ s/^    //gmo;
    @charset "utf-8";
    body {
        margin: 0 auto;
        padding: 0;
        font-family: sans-serif;
        background-color: #FFF0E0;
        color: #1F1F1F;
    }

    div {
        width: 35em;
        padding: 20px 30px;
        margin: 5px auto;
        background-color: #FFE7C6;
        text-align: justify;
    }

    h1 { font-size: x-large; font-weight: normal; }
    th { font-weight: bold; text-align: center; }

    div.input { background-color: #FF927A; }
    div.input INPUT[type="text"], div.input SELECT { background-color: #FFF0E0; border: inset 1px #1F1F1F; }
    div.input INPUT[type="text"] { width: 20em; }

    div.output { background-color: #cef0d0; }
    div.output :lang(ja) { font-size: 225%; font-family: serif; }

    .bad { color: #FF0000; font-weight: bold; }

    div.error { background-color: #202020; color: #DC143C; }

    .kana table { margin-left: auto; margin-right: 0; }
    .kana h1 { font-family: serif; text-align: center; }

    /* Styles for the top menu & footer */

    div.menubar, div.footer { text-align: center; }
    div.menubar { margin-top: 20px; }
    div.footer { margin-bottom: 20px; }

    .menubar table { letter-spacing: -1px; margin: 0 auto; width: 100%; }
    .menubar a {
        color: #840000;
        font-size: smaller;
        font-weight: bold;
        text-decoration: none;
    }
    .menubar a.highlight, .menubar a.highlight:hover { color: #630000; }
    .menubar a:hover { color: #FF0000; }
EOF
    my $expires = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime(time + 86400));
    print("Content-Type: text/css; charset=utf-8\r\n",
          "Cache-Control: max-age=86400, must-revalidate\r\n",
          "Expires: ${expires}\r\n\r\n",
          $out);
}

################################################################################
# Define variables needed by the web interface.
sub define_htmlvars {
    # Define the interface variables
    if (defined $ENV{'SERVER_NAME'} and defined $ENV{'SCRIPT_NAME'}) {
        $SELF = ($ENV{'HTTPS'} ? 'https://' : 'http://') . $ENV{'SERVER_NAME'} .
            $ENV{'SCRIPT_NAME'};
    } else {
        $SELF = $0;
    }

    &get_input(\%FORM);

    # Set the encoding function and output encoding. Unfortunately, the output
    # encoding cannot simply be set to eucjp, shiftjis, etc. because some
    # characters do not map properly. If the &encode_ functions are used
    # instead, Encode::FB_XMLCREF can be set and unmapped characters will be
    # converted into XML codes.
    my %encode = (
        'EUC-JP'      => [ \&encode_eucjp,     ':raw' ],
        'ISO-2022-JP' => [ \&encode_iso2022jp, ':raw' ],
        'ISO-8859-1'  => [ \&encode_iso88591,  ':encoding(iso-8859-1)' ],
        'Shift_JIS'   => [ \&encode_shiftjis,  ':raw' ],
        'UTF-8'       => [ \&encode_utf8,      ':utf8' ],
        );
    if (exists $FORM{'encoding'} and defined $encode{$FORM{'encoding'}}) {
        (*encode, my $mode) = @{$encode{$FORM{'encoding'}}};
        binmode STDOUT, $mode;
        $ENCODING = $FORM{'encoding'};
    } else {
        *encode = \&encode_utf8; # Default
        binmode STDOUT, ':utf8';
        $ENCODING = 'UTF-8';
    }

    # Nice format the modification time of this file.
    $MODIFIED = strftime('%a %Y-%m-%d %H:%M:%S (%Z)',
                         localtime( (stat(DATA))[9] ));
}

################################################################################
# If the purpose of this function is not obvious from the name, then it has been
# poorly named.
sub display_xhtml_footer {
    my $rawtime = (stat(DATA))[9];
    print('<div class="footer">', "\n",
          '    <p>Copyright &copy; 2007 Daniel A. Ramaley<br />', "\n",
          "    Last updated: ${MODIFIED}</p>\n",
          '</div>', "\n",
          '</body>', "\n",
          '</html>', "\n");
}

################################################################################
# The purpose of this function should be obvious from the name. It can
# optionally take a title and a hash reference of headers to send. Extra
# headers will be included both in the actual HTML header and in meta tags in
# the document header.
sub display_xhtml_header {
    my ($highlight, $title, $xtra_headers) = @_ if @_;
    $title = &encode(defined $title ? $title : 'Rōmaji to かな transliterator');
    my $encoding = ($ENCODING ? $ENCODING : 'ISO-8859-1');
    # The Content-Type really ought to be:
    #     application/xhtml+xml; charset=${encoding}
    # But most browsers don't support it. Check the HTTP_ACCEPT environment
    # variable to see if it is supported.
    my %headers = ('Content-Language' => 'en',
                   'Content-Style-Type' => 'text/css',
                   'Content-Type' => "text/html; charset=${encoding}");
    while (my ($key, $val) = each %$xtra_headers) {
        $headers{$key} = $val;
    }

    my $out = '';
    # Build the headers up so they can be sent out with 1 print statement to
    # avoid buffer issues when running under mod_perl.
    $out .= "$_: $headers{$_}\r\n" foreach (sort keys %headers);
    $out .= "\r\n";
    print $out;

    my $lang = $headers{'Content-Language'};
    my $headers = join "\n", map { sprintf('<meta http-equiv="%s" content="%s" />', $_, $headers{$_}) } sort keys %headers;
    ($out = <<EOF) =~ s/^    //gmo;
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="${lang}" lang="${lang}">
    <head>
    $headers
    <style type="text/css" media="all"><!-- \@import "${SELF}?action=css"; --></style>
    <!-- Use "Nifty Corners Cube" by Alessandro Fulciniti
         See http://www.html.it/articoli/niftycube/index.html for details. -->
    <script type="text/javascript" src="../css/niftycube.js"></script>
    <script type="text/javascript">window.onload=function(){ Nifty("div","big"); }</script>
    <title>$title</title>
    </head>
    <body>
    <div class="menubar">
        <table><tr>
EOF
    # Print the menu, highlighting whatever is selected.
    foreach ([ 'main',      'Main'               ],
             [ 'about',     'About'              ],
             [ 'kana',      'Kana&nbsp;Chart'    ],
             [ 'syllables', 'Syllable&nbsp;List' ],
             [ 'source',    'Source&nbsp;Code'   ]) {
        my ($action, $text) = @$_;
        my $xtra = ($action eq $highlight ? ' class="highlight"' : '');
        $out .= "    <td><a${xtra} href=\"$SELF?action=${action}&amp;encoding=${ENCODING}\">${text}</a></td>\n";
    }
    $out .= "</tr></table>\n</div>\n";

    print $out;
}

################################################################################
# Format and display error messages.
sub error {
    my ($error) = @_ if @_;
    $error = 'An error has occurred.' if not $error;
    &display_xhtml_header();
    print "<div class=\"error\">${error}</div>\n";
    &display_xhtml_footer;
    exit;
}

################################################################################
# Get CGI form input
sub get_input {
    return if not defined $ENV{'REQUEST_METHOD'};
    my ($hashref) = @_ if @_;
    my $qs;
    if ($ENV{'REQUEST_METHOD'} eq "GET") {
        $qs = $ENV{'QUERY_STRING'};
    }
    elsif ($ENV{'REQUEST_METHOD'} eq "POST") {
        read(STDIN, $qs, $ENV{'CONTENT_LENGTH'});
    }

    foreach (split /&/, $qs) {
        tr/+/ /;
        s/%([0-9A-Fa-f]{2})/pack("c",hex($1))/ge;
        my ($name, $value) = split(/=/, $_, 2);
        if (exists $$hashref{$name}) {
            $$hashref{$name} .= ":${value}";
        } else {
            $$hashref{$name} = $value;
        }
    }
    return 1;
}

################################################################################
# Display the 仮名 charts in standard 五十音 (gojūon "50 sound") order.
sub kana {
    my @gojūon_header =	('p','b','d','z','g','n','w','r','y','m','h','n','t',
                         's','k','','');
    my @yōon_header = ('p','b','d','z','g','r','m','h','n','t','s','k','');

    # ひらがな 五十音
    my $out = "<div class=\"kana\"><h1>ひらがな　の　五十音</h1><table>\n<tr>" .
        join('', map { "<th>$_</th>" } @gojūon_header) . "</tr>\n";
    foreach my $row (@gojūon) {
        $out .= ('<tr>' . join('',
                               map { "<td>$_</td>" }
                               map { defined $$_[1] ? $$_[1] : '' } @$row) .
                 "<th>$$row[-1][0]</th></tr>\n");
    }
    $out .= "</table></div>\n";

    # ひらがな 拗音
    $out .= "<div class=\"kana\"><h1>ひらがな　の　拗音</h1><table>\n<tr>" .
        join('', map { "<th>$_</th>" } @yōon_header) . "</tr>\n";
    foreach my $row (@yōon) {
        $out .= ('<tr>' . join('',
                               map { "<td>$_</td>" }
                               map { defined $$_[1] ? $$_[1] : '' } @$row) .
                 "</tr>\n");
    }
    $out .= "</table></div>\n";

    # カタカナ 五十音
    $out .= "<div class=\"kana\"><h1>カタカナ　の　五十音</h1><table>\n<tr>" .
        join('', map { "<th>$_</th>" } @gojūon_header) . "</tr>\n";
    foreach my $row (@gojūon) {
        $out .= ('<tr>' . join('',
                               map { "<td>$_</td>" }
                               map { defined $$_[2] ? $$_[2] : '' } @$row) .
                 "<th>$$row[-1][0]</th></tr>\n");
    }
    $out .= "</table></div>\n";

    # カタカナ 拗音
    $out .= "<div class=\"kana\"><h1>カタカナ　の　拗音</h1><table>\n<tr>" .
        join('', map { "<th>$_</th>" } @yōon_header) . "</tr>\n";
    foreach my $row (@yōon) {
        $out .= ('<tr>' . join('',
                               map { "<td>$_</td>" }
                               map { defined $$_[2] ? $$_[2] : '' } @$row) .
                 "</tr>\n");
    }
    $out .= "</table></div>\n";

    &display_xhtml_header('kana');
    print &encode($out);
    &display_xhtml_footer;
}

################################################################################
# Displays the main input form and calls the transliteration functions.
sub main {
    # Set variables, with proper encoding taken care of here.
    my ($rōmaji, $trans) = ('', '');
    if (defined $FORM{'romaji'}) {
        $rōmaji = &encode_entities(&encode($FORM{'romaji'}));
        # Transliterate the input and convert the ad-hoc <BAD> codes into XHTML
        foreach (split /(<\/?BAD>)/, &transliterate($FORM{'romaji'})) {
            /<BAD>/   and do { $trans .= '<span class="bad" lang="en" xml:lang="en">'; next; };
            /<\/BAD>/ and do { $trans .= '</span>'; next; };
            $trans .= &encode($_);
        }
    }

    # Build the input section of the page.
    my $o = &encode('ō');
    (my $out = <<EOF) =~ s/^    //gmo;
    <div class="input">
        <form method="get" action="${SELF}">
        <p><input type="hidden" name="action" value="main" />
        R${o}maji: <input type="text" name="romaji" value="${rōmaji}" size="50" /><br />
        Encoding: <select name="encoding">
EOF
    foreach (['UTF-8',       'Unicode (UTF-8)'  ],
             ['ISO-8859-1',  'XHTML &amp;#codes;'],
             ['EUC-JP',      'euc-jp'           ],
             ['Shift_JIS',   'shift_jis'        ],
             ['ISO-2022-JP', 'iso-2022-jp'      ]
    ) {
        my ($val, $text) = @$_;
        my $selected = ($ENCODING eq $val ? ' selected="selected"' : '');
        $out .= "    <option value=\"${val}\"${selected}>${text}</option>\n"
    }
    $out .= ('    </select>'."\n" .
             '    <input type="submit" value="Go!" /></p>'."\n".
             '    </form>'."\n".
             '</div>'."\n");
    # Add the output section if necessary
    $out .= ("<div class=\"output\">\n" .
             "    <p>${rōmaji}:<br />\n" .
             "    <span lang=\"ja\" xml:lang=\"ja\">${trans}</span></p>\n" .
             "</div>\n")
        if $rōmaji;

    &display_xhtml_header('main');
    print $out;
    &display_xhtml_footer;
}

################################################################################
# Display the source code.
sub source {
    # application/x-perl is most proper, but text/plain will allow the source to
    # be viewed directly by most browsers without attempting to download first.
    #my @headers = ('Content-Type: application/x-perl; charset=utf-8');
    my @headers = ('Content-Type: text/plain; charset=utf-8');
    my $size = -s DATA;
    push @headers, "Content-Length: ${size}" if $size;
    binmode STDOUT, ':utf8';
    seek DATA, 0, 0;
    local $/ = undef;
    print join("\r\n", @headers), "\r\n\r\n", <DATA>
}

################################################################################
# Display the list of rōmaji syllables the program can transliterate.
# This routine is a quick hack. It could probably be improved. Areas of most
# needed improvement: memory usage, sort order of characters.
sub syllables {
    my %hiragana = ();
    foreach (map { /^[a-z]/o ? $_ : () } keys %kana) {
        push @{$hiragana{$kana{$_}}}, $_;
    }

    my %katakana = ();
    foreach (map { /^[A-Z]/o ? $_ : () } keys %kana) {
        push @{$katakana{$kana{$_}}}, $_;
    }

    my %symbols = ();
    foreach (map { /^[A-Za-z]/o ? () : $_ } keys %kana) {
        push @{$symbols{$kana{$_}}}, $_;
    }

    (my $out = <<EOF) =~ s/^    //gmo;
    <div class="info">
    <p>This page lists the rōmaji to 仮名 transliteration tables that are used
    internally by the program. It is probably of limited use, but if you want to
    see exactly what characters combinations it is possible to generate, this is
    the right place. Note that corner brackets are not included on this page
    since they are processed separately by the quote handling routine. Please
    note the symbols at the bottom of this page; there are a number of brackets
    that are not well documented elsewhere in the program.</p>
    </div>
    <div class="info">
    <table><tr><th>rōmaji</th><th>ひらがな</th></tr>
EOF
    foreach (sort { length($a) cmp length($b) or $a cmp $b } keys %hiragana) {
        $out .= sprintf("<tr><td>%s</td><td>%s</td></tr>\n",
                        join(', ', sort @{$hiragana{$_}}), $_);
    }
    $out .= "</table>\n</div>\n<div class=\"info\">\n" .
        "<table><tr><th>rōmaji</th><th>カタカナ</th></tr>\n";
    foreach (sort { length($a) cmp length($b) or $a cmp $b } keys %katakana) {
        $out .= sprintf("<tr><td>%s</td><td>%s</td></tr>\n",
                        join(', ', sort @{$katakana{$_}}), $_);
    }
    $out .= "</table>\n</div>\n<div class=\"info\">\n" .
        "<table><tr><th>rōmaji</th><th>Symbols</th></tr>\n";
    foreach (sort keys %symbols) {
        $out .= sprintf("<tr><td>%s</td><td>%s</td></tr>\n",
                        &encode_entities(join(', ', sort @{$symbols{$_}})),
                        $_);
    }
    $out .= "</table>\n</div>\n";

    &display_xhtml_header('syllables');
    print &encode($out);
    &display_xhtml_footer;
}

# Do not remove the __DATA__ line; it is used to print the source code.
__DATA__

TODO:

make the form accept non-ascii characters such as ū
make it work under mod_perl