#!/usr/bin/perl # # WARNING: This file contains UTF-8 Unicode characters. Please take any # necessary precautions to avoid corruption when transmitting this file. # # Rōmaji to Kana transliterator # Copyright (C) 2007 Daniel A. Ramaley # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # or visit http://www.gnu.org/copyleft/gpl.txt. # # # Documentation about what this program does is provided in the &about function. use v5.8.1; # Make sure Unicode support is decent use warnings 'all'; use strict; use charnames ':full'; # Allow English names for Unicode characters local *vianame = \&charnames::vianame; # Alias this because it is used a lot. use utf8; # Needed if the source file contains Unicode use Encode qw(); use HTML::Entities qw(&encode_entities); use POSIX qw(&strftime); # These variables are used by the transliteration process. The first four # contain static data and are created by &define_vars: # @gojūon - Holds the 50-sounds chart so it can be easily displayed. # %kana - Hash table to hold 仮名 mappings. # $kana_regex - Used to break Latin strings into rōmaji syllables. # %preprocess - Hash table to hold Latin -> Latin mappings that are applied # prior to the rōmaji -> 仮名 conversion. # $pre_regex - Similar function as $kana_regex, but for %preprocess. # @yōon - Holds the yōon chart so it can be easily displayed # The last variable must be emptied for each run: # @quote_stack - Used by &special. our (@gojūon, %kana, $kana_regex, %preprocess, $pre_regex, @yōon); our @quote_stack = (); &define_vars unless %kana; # These variables are used by the XHTML interface side of the script. our %FORM = (); # Form input our $ENCODING = undef; # Output encoding our $SELF; # URL for this script our $MODIFIED; # Date this file was last changed &define_htmlvars; if (defined $FORM{'action'}) { # Use a dispatch table instead of a messy if/elsif condition. my %actions = ('about' => \&about, 'css' => \&css, 'kana' => \&kana, 'main' => \&main, 'source' => \&source, 'syllables' => \&syllables, ); &error('Bad input given.') if not defined $actions{$FORM{'action'}}; &{$actions{$FORM{'action'}}}; } else { &main; } exit; ################################################################################ ################################################################################ #### Functions to perform ローマ字 to 仮名 conversion #### ################################################################################ ################################################################################ ################################################################################ # This function builds a 3-dimensional array that contains the 五十音 (gojūon) # chart in 縦書き (tategaki "vertical writing") ordering. The 3rd dimension is # used to store all of the rōmaji, ひらがな, and カタカナ in one array. # This must be called after the standard gojūon have been defined in %kana # but before the non-standard additions have been added. sub define_gojūon { my @headings = ('p','b','d','z','g','#','w','r','y','m','h','n','t','s','k',''); my @vowels = ('a', 'i', 'u', 'e', 'o'); foreach my $row (@vowels) { my @thisrow = (); foreach my $col (@headings) { if ($col eq '#') { push @thisrow, ($row eq 'a' ? ['n', 'ん', 'ン'] : [(undef) x 3]); } else { my $syl = $col . $row; push @thisrow, [ $syl, $kana{$syl}, $kana{uc $syl} ]; } } push @gojūon, \@thisrow; } } ################################################################################ # Programmatically define the 仮名. Unicode character names are used instead # of hex codes for easier reading and debugging, at the expense of execution # speed. sub define_vars { # Variables used in this function. Don't touch. my @vowels = ('a', 'i', 'u', 'e', 'o'); # Standard 仮名 ############### # Small letters foreach (@vowels, 'ka', 'ke', 'tu', 'ya', 'yu', 'yo', 'wa') { my $uc = uc; $kana{"x${_}"} = sprintf '%c', &vianame("HIRAGANA LETTER SMALL ${uc}"); $kana{"X${uc}"} = sprintf '%c', &vianame("KATAKANA LETTER SMALL ${uc}"); } # These small カタカナ are only used by the アイヌ語 (Ainu language) but are # included here for completeness. foreach ('KU', 'SI', 'SU', 'TO', 'NU', 'HA', 'HI', 'HU', 'HE', 'HO', 'MU', 'RA', 'RI', 'RU', 'RE', 'RO') { $kana{"X${_}"} = sprintf '%c', &vianame("KATAKANA LETTER SMALL ${_}"); } # Main 五十音 (gojūon) 仮名. Also 濁点 (dakuten) and 半濁点 (handakuten) foreach my $consonant ('','k','s','t','n','h','m','y','r','w', # 五十音 'g', 'z', 'd', 'b', 'p') { # 濁点、半濁点 foreach my $vowel (@vowels) { my $cv = $consonant . $vowel; my $uc = uc $cv; my ($hiragana, $katakana); $kana{$cv} = sprintf '%c', $hiragana if $hiragana = &vianame("HIRAGANA LETTER ${uc}"); $kana{$uc} = sprintf '%c', $katakana if $katakana = &vianame("KATAKANA LETTER ${uc}"); } } # ん is hard to define with the above loop: @kana{'n', 'N'} = ('ん', 'ン'); # This needs to be done here after the main 五十音 have been defined and # before any extra definitions have been added (such as non-existent "wu"). &define_gojūon; # This one can be done later, but it is here because it is related to the # &define_gojūon call. &define_yōon; # Another way to access archaic ゐ and ゑ: @kana{'wyi', 'WYI', 'wye', 'WYE'} = ('ゐ', 'ヰ', 'ゑ', 'ヱ'); # Non-standard syllables to complete the chart: @kana{'yi','YI','ye','YE','wu','WU'} = ('い','イ','いぇ','イェ','う','ウ'); # Redefine カタカナ ゐ (wi) ゑ (we) and ヲ (wo) to reflect modern usage. # The last one provides access to ヲ that was redefined. However, it is # *very* nonstandard (i've never seen it anywhere else). @kana{'WI', 'WE', 'WO', 'WYO'} = ('ウィ', 'ウェ', 'ウォ', 'ヲ'); # Add 拗音 (Yōon) syllables. Note that "yi" and "ye" forms are seldom used. foreach my $consonant ('k','s','t','n','h','m','r','g','z','d','b','p') { my $uc = uc $consonant; my ($hira, $kata) = @kana{"${consonant}i", "${uc}I"}; @kana{map { "${consonant}y${_}" } @vowels} = map {"${hira}${_}"} ('ゃ', 'ぃ', 'ゅ', 'ぇ', 'ょ'); @kana{map { "${uc}Y" . uc $_ } @vowels} = map {"${kata}${_}"} ('ャ', 'ィ', 'ュ', 'ェ', 'ョ'); } # Alias "l" to be the same as "r" foreach my $r (grep /^r/io, keys %kana) { (my $l = $r) =~ tr/Rr/Ll/; $kana{$l} = $kana{$r}; } # Seldom-used and less standard combinations ############################################ # Note that a few definitions below are actually quite common, "tsu" and # "fu" for instance. # Combinations that use small vowels @kana{'vu', 'VU'} = ('ゔ', 'ヴ'); # Initialize these so the loop will work. foreach (['v', 'vu'], ['kh', 'ku'], ['kw', 'ku'], ['q', 'ku'], ['sw', 'su'], ['ts', 'tu'], ['tw', 'to'], ['f', 'hu'], ['mh', 'mu'], ['mw', 'mu'], ['wh', 'u' ], ['gh', 'gu'], ['gw', 'gu'], ['zh', 'zu'], ['zw', 'zu'], ['dw', 'do'], ['bh', 'ba'], ['bw', 'bi'], ['ph', 'pu'], ['pw', 'pu']) { my ($r, $consonant) = @$_; my ($hira, $kata) = @kana{$consonant, uc ($consonant)}; @kana{map { "${r}${_}" } @vowels} = map { "${hira}${_}" } ('ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'); @kana{map { uc "${r}${_}" } @vowels} = map { "${kata}${_}" } ('ァ', 'ィ', 'ゥ', 'ェ', 'ォ'); } # Exceptions to the above that use ゎ (small wa) @kana{'kwa', 'KWA', 'mwa', 'MWA', 'gwa', 'GWA', 'zwa', 'ZWA', 'bwa', 'BWA', 'pwa', 'PWA'} = ('くゎ', 'クヮ', 'むゎ', 'ムヮ', 'ぐゎ', 'グヮ', 'ずゎ', 'ズヮ', 'びゎ', 'ビヮ', 'ぷゎ', 'プヮ'); # Exceptions that map to just 1 character @kana{'tsu', 'TSU', 'fu', 'FU', 'whu', 'WHU', 'vu', 'VU'} = ('つ', 'ツ', 'ふ', 'フ', 'う', 'ウ', 'ゔ', 'ヴ'); # Note that the syllables VA, VI, VE, VO exist in カタカナ, but have been # replaced in modern usage with the combinations VU plus a small vowel. If # the older V syllables are desired, they have been defined here with the # X prefix. Note that this is *very* nonstandard: @kana{'XVA', 'XVI', 'XVU', 'XVE', 'XVO'} = ('ヷ', 'ヸ', 'ヴ', 'ヹ', 'ヺ'); # Combinations that use small ya, etc. foreach (['vy', 'vu'], ['sh', 'si'], ['j', 'zi'], ['jy', 'zi'], ['th', 'te'], ['ch', 'ti'], ['cy', 'ti'], ['fy', 'hu'], ['dh', 'de']) { my ($r, $consonant) = @$_; my ($hira, $kata) = @kana{$consonant, uc ($consonant)}; @kana{map { "${r}${_}" } @vowels} = map { "${hira}${_}" } ('ゃ', 'ぃ', 'ゅ', 'ぇ', 'ょ'); @kana{map { uc "${r}${_}" } @vowels} = map { "${kata}${_}" } ('ャ', 'ィ', 'ュ', 'ェ', 'ョ'); } # Exceptions that map to just 1 character @kana{'shi','SHI','ji','JI','chi','CHI'} = ('し','シ','じ','ジ','ち','チ'); # Bizarre aliases @kana{'ca', 'ci', 'cu', 'ce', 'co'} = @kana{'ka', 'si', 'ku', 'se', 'ko'}; @kana{'CA', 'CI', 'CU', 'CE', 'CO'} = @kana{'KA', 'SI', 'KU', 'SE', 'KO'}; @kana{'jhe', 'JHE', 'dzi', 'DZI'} = @kana{'za', 'ZA', 'di', 'DI'}; # Rare "ŋ" or "ng" sound formed by combining K consonants with 半濁点 # These are preceded with "x" to avoid collision with the far more common # "んご" sequence. my $半濁点 = "\N{COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK}"; @kana{map { "xng${_}" } @vowels} = map { $kana{"k${_}"} . $半濁点 } @vowels; @kana{map { 'XNG' . uc($_) } @vowels} = map { $kana{'K' . uc ($_)} . $半濁点 } @vowels; # Numerals and punctuation ########################## # Numerals @kana{'0','1','2','3','4','5','6','7','8','9'} = ('0','1','2','3','4','5','6','7','8','9'); # Punctuation. Some of these are not traditionally used. @kana{' ',',','.','!','?',';',':'} = (' ','、','。','!','?',';',':'); @kana{'~','#','$','%','&','@','-'} = ('〜','#','$','%','&','@','ー'); @kana{'*','+','/','\\','='} = ('*','+','/','\','='); @kana{'^','_','|'} = ('^','_','|'); @kana{'..','...'} = ('‥','……'); # Traditional Japanese ellipsis is 6 dots # Quotation marks # Note: Distinguish opening and closing quotes is done with logic in the # &special function which is called when a character sequence is not # recognized. So do not define quotes here. #@kana{'"','"','""','""'} = ('「','」','『','』'); # Braces. # Note that most of these definitions are *very* non-standard and were # chosen such that it is possible to type any of the braces, even if they # are seldom used. Commonly used braces should work as expected. @kana{'(', ')', '{', '}', '[', ']', '[[', ']]', '<', '>', '<<', '>>', '[(', ')]', '[{', '}]', '([', '])', '({', '})'} = ('(',')','{','}', # Parenthesis, Curly Bracket '[',']','〚','〛', # Square Bracket, White Square Bracket '〈','〉','《','》', # Angle Bracket, Double Angle Bracket '【','】','〖','〗', # Black Lenticular Bracket, White Lenticular Bracket '〔','〕','〘','〙', # Tortoise Shell Bracket, White Tortoise Shell Bracket ); # Regex definition ################## # This regular expression is used to split the input string into syllables. # Though this regex is quite large, the structure of it is very simple, with # the interesting stuff happening in &$regex_sort. $kana_regex = join '|', map { quotemeta } sort regex_sort keys %kana; $kana_regex = qr/($kana_regex)/o; # Preprocess definition ####################### # %preprocess is used in case of circumflex or macron characters in the # input. Most rōmaji systems use these characters to represent long vowels. @preprocess{'Â','Î','Û','Ê','Ô','Ā','Ī','Ū','Ē','Ō'} = ('A-','I-','U-','E-','O-') x 2; @preprocess{'â','î','û','ê','ô','ā','ī','ū','ē','ō'} = ('aa','ii','uu','ei','ou') x 2; @preprocess{'Ŋ', 'ŋ'} = ('XNG', 'xng'); @preprocess{"N\N{COMBINING MACRON}","n\N{COMBINING MACRON}"} = ("N'","n'"); $pre_regex = join '|', sort keys %preprocess; $pre_regex = qr/($pre_regex)/o; } ################################################################################ # This function builds a 3-dimensional array that containing the 拗音 (yōon) # chart in 縦書き (tategaki "vertical writing") ordering. The 3rd dimension is # used to store all of the rōmaji, ひらがな, and カタカナ in one array. sub define_yōon { foreach my $row ('a', 'u', 'o') { my @thisrow = (); foreach my $col ('p','b','d','z','g','r','m','h','n','t','s','k') { push @thisrow, ["${col}y${row}", $kana{$col.'i'}.$kana{'xy'.$row}, $kana{uc $col.'I'}.$kana{'XY'.uc $row}]; } push @yōon, \@thisrow; } } ################################################################################ # Used to sort syllables for use in a regex. The sort order should be such that # syllables are matched correctly, with a secondary emphasis on performance of # the regular expression parser. # # Please note that it is likely that my understanding of the inner workings of # the regular expression parser is incomplete or incorrect and there may be a # more efficient way to order these. I do not claim that this is the *best* way, # just that it should work and not give abysmal performance. sub regex_sort { # Organize entries loosely by how often entries are likely to be # encountered: put lowercase characters (ひらがな) first followed by # uppercase (カタカナ) and then symbols. ($b =~ /^[a-z]/o) <=> ($a =~ /^[a-z]/o) or ($b =~ /^[A-Z]/o) <=> ($a =~ /^[A-Z]/o) or # Group entries with the same initial characters together to help the # regex parser skip groups of non-matches more quickly substr($a, 0, 1) cmp substr($b, 0, 1) or # Then reverse sort by length. This is done, for example, so "na" has a # chance to match before "n" and "..." will be matched as 1 entry # instead of three occurences of "." length($b) cmp length($a) or # And finally by ASCII value (mostly just to ensure a predictable order) $a cmp $b } ################################################################################ # Handle character sequences that were not found in %kana. This is where Latin # quotes are converted into angle brackets. Completely unrecognized sequences # are returned with tags so that they may be displayed differently. sub special { my ($chars) = @_ if @_; my %brackets = ('"' => ['「', '」'], '""' => ['『', '』']); my @chunks = (map { length($_) ? $_ : () } # Eat empty strings from split split /(\"\"|\")/o, $chars); # Split into chunks of quotes my $returnval = ''; foreach (@chunks) { if (exists $brackets{$_}) { if (@quote_stack and $quote_stack[-1] eq $_) { # Close an open set of brackets $returnval .= ${$brackets{$_}}[1]; pop @quote_stack; } else { # Start a new pair of brackets $returnval .= ${$brackets{$_}}[0]; push @quote_stack, $_; } } else { # This chunk was not recognized! $returnval .= "$_"; } } return $returnval; } ################################################################################ # This function's purpose is very simple: take a string of ローマ字 and return # 仮名. This is done in 3 passes. The first normalizes the input by converting # double consonants and macrons into their longer forms. Pass 2 identifies what # parts of the input can be processed and converts recognizable chunks into # 仮名 while tagging unrecognizable chunks. Pass 3 does some final cleanup on # the output and may be easily commented out if so desired. sub transliterate { my ($input) = @_ if @_; my $output; ########## # PASS 1 # ########## # Convert doubled consonants into "xtu". This has to be done before long # vowel conversion so that ŋ and Ŋ are handled correctly. $input =~ s/([bcdfghjklmnpqrstvwxyzŋ])\1/xtu$1/go; $input =~ s/([BCDFGHJKLMNPQRSTVWXYZŊ])\1/XTU$1/go; # Convert macron long vowels into explicit doubled vowels. $input =~ s/$pre_regex/$preprocess{$1}/ego; ########## # PASS 2 # ########## $output = join '', ( # This line is where rōmaji finally becomes 仮名. map { exists $kana{$_} ? $kana{$_} : &special($_) } # Eat empty strings added by split map { length($_) ? $_ : () } # Break the string into chunks of recognized characters map { split /$kana_regex/o } #Consider enabling the next line and then at this position add an "ng$ -> n" #mapping. possibly also add "s$ -> su" and some other whispered vowels. # Break string into chunks of uppercase, lowercase, and other characters #map { m/([a-zŋ]+|[A-ZŊ]+|[^a-zŋA-ZŊ]+)/go } # Single quote is used to differentiate にゃ (nya) from んや (n'ya) split /\'/o, $input ); ########## # PASS 3 # ########## # Add middle dots between カタカナ words. 1 while ($output =~ s/(\p{inKatakana})\s(\p{inKatakana})/$1・$2/go); return $output; } ################################################################################ ################################################################################ #### Encoding functions #### ################################################################################ ################################################################################ ################################################################################ # Convert to EUC-JP sub encode_eucjp { my $val = $_[0]; return &Encode::encode('euc-jp', $val, Encode::FB_XMLCREF); } ################################################################################ # Convert to ISO-2022-JP sub encode_iso2022jp { my $val = $_[0]; return &Encode::encode('iso-2022-jp-1', $val, Encode::FB_XMLCREF); } ################################################################################ # Encode non-ASCII Unicode characters with their Unicode character number. sub encode_iso88591 { # Do not use &encode_entities here because it also encodes quote characters return join('', map { ord > 127 ? sprintf('&#x%x;', ord) : $_ } split //, $_[0]); } ################################################################################ # Convert to Shift_JIS sub encode_shiftjis { my $val = $_[0]; return &Encode::encode('shiftjis', $val, Encode::FB_XMLCREF); } ################################################################################ # UTF-8 encoding is trivial since it is Perl's native format sub encode_utf8 { return $_[0]; } ################################################################################ ################################################################################ #### Interface functions #### ################################################################################ ################################################################################ ################################################################################ # Print the help/about page sub about { (my $out = <This program accepts input in ローマ字 (rōmaji) and produces the equivalent ひらがな (hiragana) and カタカナ (katakana) output. This program does not map arbitrary English phonemes into the Japanese sound system, nor does it translate from English to Japanese.

Transliteration is done by use of an associative array (hash table) that contains mappings from syllables written in Latin letters to 仮名 (kana). ひらがな may be produced by entering lowercase letters, カタカナ by using uppercase. Some punctuation is also allowed, though the Romanization for the lesser-used brackets is quite non-standard.

Some attempt has been made to simultaneously support the most common Romanization systems (修正ヘボン式ローマ字 (Modified Hepburn), 訓令式ローマ字 (Kunrei-siki), and 日本式ローマ字 (Nihon-siki), though completely meeting that goal is impossible. Where different systems conflict, precedence has been given to 日本式ローマ字 since it is the most regular and has a 1-to-1 relation between 仮名 and rōmaji.

Unrecognized characters in the input will be returned unprocessed but highlighted in a different color.

Quotes

Apostrophes (single quotes) are used to differentiate ん (syllabic n) from a doubled consonant when it appears before one of な に ぬ ね の. They are similarly used to prevent ん from binding with vowels and y syllables. For example, 「んな」, 「んあ」, and 「んや」 ("n'na", "n'a", and "n'ya") all require an apostrophe, while 「っな」, 「な」, and 「にゃ」 ("nna", "na", and "nya") do not.

An attempt has been made to translate Latin quotes properly such that regular double and single quotes as are used in English will be converted into left and right corner brackets (「」『』) as appropriate. However, there is a peculiarity with the way that white corner brackets (『 and 』) are represented:

Normally single quotes would be used to represent white corner brackets, but this is not possible since single quotes have already been used as described previously. Instead, white corner brackets are represented by doubled double quotes (""). This is admittedly non-standard and may change if a better solution is discovered.

A limitation of the quote translation is that quotes must be balanced. That is, if a set of regular corner brackets is opened, it is necessary to close it before the next set of the same type of bracket may be opened. For example, this ordering of brackets is possible: 「『』『』」「」 (and may be generated with this rather improbable sequence: "\'""\'""\'""\'""\'"\'"\'") while this one is not: 「『」』.

X Characters

Some characters are difficult to represent in rōmaji. These characters\' Romanization has been preceded with an "x". The cases where "x" is used are:

Output

Several output encodings are available. UTF-8 is the preferred encoding, and all modern web browsers should natively support it. ISO-8859-1 encoding with XHTML &#codes; is provided as a second option for older browsers that do not properly handle Unicode. In addition, the following legacy encodings are provided: EUC-JP, Shift_JIS, ISO-2022-JP. Please note that the legacy encodings have not been as thoroughly tested.

All pages except the Source Code page inherit the encoding set on the main page. The Source Code page always uses UTF-8 since that is what the script is actually encoded in and a different encoding may break it.

Meta

The idea for this program was borrowed from Joel Yliluoma (Bisqwit)'s Romaji to hiragana converter. Though the source for Bisqwit's program is generously provided, no actual code was borrowed.

The main reasons for writing this program were to gain a deeper understanding of rōmaji to 仮名 transliteration, to play with Unicode and other encoding methods for non-English languages, to work more with 仮名, and to start working with 漢字 (kanji) (the latter only appears in the comments and documentation for now).

The previously linked Wikipedia articles, as well as several others linked from the Japanese Writing entry were helpful in researching how transliteration is done. The Kotoeri input method used on Mac OS X was also helpful in deciding how to deal with various edge cases.

EOF &display_xhtml_header('about'); print '
', &encode($out), '
'; &display_xhtml_footer; } ################################################################################ # Return the CSS document. sub css { (my $out = <<'EOF') =~ s/^ //gmo; @charset "utf-8"; body { margin: 0 auto; padding: 0; font-family: sans-serif; background-color: #FFF0E0; color: #1F1F1F; } div { width: 35em; padding: 20px 30px; margin: 5px auto; background-color: #FFE7C6; text-align: justify; } h1 { font-size: x-large; font-weight: normal; } th { font-weight: bold; text-align: center; } div.input { background-color: #FF927A; } div.input INPUT[type="text"], div.input SELECT { background-color: #FFF0E0; border: inset 1px #1F1F1F; } div.input INPUT[type="text"] { width: 20em; } div.output { background-color: #cef0d0; } div.output :lang(ja) { font-size: 225%; font-family: serif; } .bad { color: #FF0000; font-weight: bold; } div.error { background-color: #202020; color: #DC143C; } .kana table { margin-left: auto; margin-right: 0; } .kana h1 { font-family: serif; text-align: center; } /* Styles for the top menu & footer */ div.menubar, div.footer { text-align: center; } div.menubar { margin-top: 20px; } div.footer { margin-bottom: 20px; } .menubar table { letter-spacing: -1px; margin: 0 auto; width: 100%; } .menubar a { color: #840000; font-size: smaller; font-weight: bold; text-decoration: none; } .menubar a.highlight, .menubar a.highlight:hover { color: #630000; } .menubar a:hover { color: #FF0000; } EOF my $expires = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime(time + 86400)); print("Content-Type: text/css; charset=utf-8\r\n", "Cache-Control: max-age=86400, must-revalidate\r\n", "Expires: ${expires}\r\n\r\n", $out); } ################################################################################ # Define variables needed by the web interface. sub define_htmlvars { # Define the interface variables if (defined $ENV{'SERVER_NAME'} and defined $ENV{'SCRIPT_NAME'}) { $SELF = ($ENV{'HTTPS'} ? 'https://' : 'http://') . $ENV{'SERVER_NAME'} . $ENV{'SCRIPT_NAME'}; } else { $SELF = $0; } &get_input(\%FORM); # Set the encoding function and output encoding. Unfortunately, the output # encoding cannot simply be set to eucjp, shiftjis, etc. because some # characters do not map properly. If the &encode_ functions are used # instead, Encode::FB_XMLCREF can be set and unmapped characters will be # converted into XML codes. my %encode = ( 'EUC-JP' => [ \&encode_eucjp, ':raw' ], 'ISO-2022-JP' => [ \&encode_iso2022jp, ':raw' ], 'ISO-8859-1' => [ \&encode_iso88591, ':encoding(iso-8859-1)' ], 'Shift_JIS' => [ \&encode_shiftjis, ':raw' ], 'UTF-8' => [ \&encode_utf8, ':utf8' ], ); if (exists $FORM{'encoding'} and defined $encode{$FORM{'encoding'}}) { (*encode, my $mode) = @{$encode{$FORM{'encoding'}}}; binmode STDOUT, $mode; $ENCODING = $FORM{'encoding'}; } else { *encode = \&encode_utf8; # Default binmode STDOUT, ':utf8'; $ENCODING = 'UTF-8'; } # Nice format the modification time of this file. $MODIFIED = strftime('%a %Y-%m-%d %H:%M:%S (%Z)', localtime( (stat(DATA))[9] )); } ################################################################################ # If the purpose of this function is not obvious from the name, then it has been # poorly named. sub display_xhtml_footer { my $rawtime = (stat(DATA))[9]; print('', "\n", '', "\n", '', "\n"); } ################################################################################ # The purpose of this function should be obvious from the name. It can # optionally take a title and a hash reference of headers to send. Extra # headers will be included both in the actual HTML header and in meta tags in # the document header. sub display_xhtml_header { my ($highlight, $title, $xtra_headers) = @_ if @_; $title = &encode(defined $title ? $title : 'Rōmaji to かな transliterator'); my $encoding = ($ENCODING ? $ENCODING : 'ISO-8859-1'); # The Content-Type really ought to be: # application/xhtml+xml; charset=${encoding} # But most browsers don't support it. Check the HTTP_ACCEPT environment # variable to see if it is supported. my %headers = ('Content-Language' => 'en', 'Content-Style-Type' => 'text/css', 'Content-Type' => "text/html; charset=${encoding}"); while (my ($key, $val) = each %$xtra_headers) { $headers{$key} = $val; } my $out = ''; # Build the headers up so they can be sent out with 1 print statement to # avoid buffer issues when running under mod_perl. $out .= "$_: $headers{$_}\r\n" foreach (sort keys %headers); $out .= "\r\n"; print $out; my $lang = $headers{'Content-Language'}; my $headers = join "\n", map { sprintf('', $_, $headers{$_}) } sort keys %headers; ($out = < $headers $title \n"; print $out; } ################################################################################ # Format and display error messages. sub error { my ($error) = @_ if @_; $error = 'An error has occurred.' if not $error; &display_xhtml_header(); print "
${error}
\n"; &display_xhtml_footer; exit; } ################################################################################ # Get CGI form input sub get_input { return if not defined $ENV{'REQUEST_METHOD'}; my ($hashref) = @_ if @_; my $qs; if ($ENV{'REQUEST_METHOD'} eq "GET") { $qs = $ENV{'QUERY_STRING'}; } elsif ($ENV{'REQUEST_METHOD'} eq "POST") { read(STDIN, $qs, $ENV{'CONTENT_LENGTH'}); } foreach (split /&/, $qs) { tr/+/ /; s/%([0-9A-Fa-f]{2})/pack("c",hex($1))/ge; my ($name, $value) = split(/=/, $_, 2); if (exists $$hashref{$name}) { $$hashref{$name} .= ":${value}"; } else { $$hashref{$name} = $value; } } return 1; } ################################################################################ # Display the 仮名 charts in standard 五十音 (gojūon "50 sound") order. sub kana { my @gojūon_header = ('p','b','d','z','g','n','w','r','y','m','h','n','t', 's','k','',''); my @yōon_header = ('p','b','d','z','g','r','m','h','n','t','s','k',''); # ひらがな 五十音 my $out = "

ひらがな の 五十音

\n" . join('', map { "" } @gojūon_header) . "\n"; foreach my $row (@gojūon) { $out .= ('' . join('', map { "" } map { defined $$_[1] ? $$_[1] : '' } @$row) . "\n"); } $out .= "
$_
$_$$row[-1][0]
\n"; # ひらがな 拗音 $out .= "

ひらがな の 拗音

\n" . join('', map { "" } @yōon_header) . "\n"; foreach my $row (@yōon) { $out .= ('' . join('', map { "" } map { defined $$_[1] ? $$_[1] : '' } @$row) . "\n"); } $out .= "
$_
$_
\n"; # カタカナ 五十音 $out .= "

カタカナ の 五十音

\n" . join('', map { "" } @gojūon_header) . "\n"; foreach my $row (@gojūon) { $out .= ('' . join('', map { "" } map { defined $$_[2] ? $$_[2] : '' } @$row) . "\n"); } $out .= "
$_
$_$$row[-1][0]
\n"; # カタカナ 拗音 $out .= "

カタカナ の 拗音

\n" . join('', map { "" } @yōon_header) . "\n"; foreach my $row (@yōon) { $out .= ('' . join('', map { "" } map { defined $$_[2] ? $$_[2] : '' } @$row) . "\n"); } $out .= "
$_
$_
\n"; &display_xhtml_header('kana'); print &encode($out); &display_xhtml_footer; } ################################################################################ # Displays the main input form and calls the transliteration functions. sub main { # Set variables, with proper encoding taken care of here. my ($rōmaji, $trans) = ('', ''); if (defined $FORM{'romaji'}) { $rōmaji = &encode_entities(&encode($FORM{'romaji'})); # Transliterate the input and convert the ad-hoc codes into XHTML foreach (split /(<\/?BAD>)/, &transliterate($FORM{'romaji'})) { // and do { $trans .= ''; next; }; /<\/BAD>/ and do { $trans .= ''; next; }; $trans .= &encode($_); } } # Build the input section of the page. my $o = &encode('ō'); (my $out = <

R${o}maji:
Encoding: '."\n" . '

'."\n". '
'."\n". ''."\n"); # Add the output section if necessary $out .= ("
\n" . "

${rōmaji}:
\n" . " ${trans}

\n" . "
\n") if $rōmaji; &display_xhtml_header('main'); print $out; &display_xhtml_footer; } ################################################################################ # Display the source code. sub source { # application/x-perl is most proper, but text/plain will allow the source to # be viewed directly by most browsers without attempting to download first. #my @headers = ('Content-Type: application/x-perl; charset=utf-8'); my @headers = ('Content-Type: text/plain; charset=utf-8'); my $size = -s DATA; push @headers, "Content-Length: ${size}" if $size; binmode STDOUT, ':utf8'; seek DATA, 0, 0; local $/ = undef; print join("\r\n", @headers), "\r\n\r\n", } ################################################################################ # Display the list of rōmaji syllables the program can transliterate. # This routine is a quick hack. It could probably be improved. Areas of most # needed improvement: memory usage, sort order of characters. sub syllables { my %hiragana = (); foreach (map { /^[a-z]/o ? $_ : () } keys %kana) { push @{$hiragana{$kana{$_}}}, $_; } my %katakana = (); foreach (map { /^[A-Z]/o ? $_ : () } keys %kana) { push @{$katakana{$kana{$_}}}, $_; } my %symbols = (); foreach (map { /^[A-Za-z]/o ? () : $_ } keys %kana) { push @{$symbols{$kana{$_}}}, $_; } (my $out = <

This page lists the rōmaji to 仮名 transliteration tables that are used internally by the program. It is probably of limited use, but if you want to see exactly what characters combinations it is possible to generate, this is the right place. Note that corner brackets are not included on this page since they are processed separately by the quote handling routine. Please note the symbols at the bottom of this page; there are a number of brackets that are not well documented elsewhere in the program.

EOF foreach (sort { length($a) cmp length($b) or $a cmp $b } keys %hiragana) { $out .= sprintf("\n", join(', ', sort @{$hiragana{$_}}), $_); } $out .= "
rōmajiひらがな
%s%s
\n
\n
\n" . "\n"; foreach (sort { length($a) cmp length($b) or $a cmp $b } keys %katakana) { $out .= sprintf("\n", join(', ', sort @{$katakana{$_}}), $_); } $out .= "
rōmajiカタカナ
%s%s
\n
\n
\n" . "\n"; foreach (sort keys %symbols) { $out .= sprintf("\n", &encode_entities(join(', ', sort @{$symbols{$_}})), $_); } $out .= "
rōmajiSymbols
%s%s
\n
\n"; &display_xhtml_header('syllables'); print &encode($out); &display_xhtml_footer; } # Do not remove the __DATA__ line; it is used to print the source code. __DATA__ TODO: make the form accept non-ascii characters such as ū make it work under mod_perl