diff -urN perl5.004_02/Changes.jp jperl5.004_02/Changes.jp --- perl5.004_02/Changes.jp Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/Changes.jp Fri Aug 8 10:10:23 1997 @@ -0,0 +1,159 @@ +Fri Aug 8 10:08:21 1997 WATANABE Hirofumi + + * jperl5.004_02.pat: + + * perl5.004_02 対応. + +Thu Aug 7 13:00:23 1997 WATANABE Hirofumi + + * jperl5.004_01_03.pat: + + * perl5.004_01_03 対応. + + * $_ = 'abc0123"; tr/0-9/a/c; が "aaa0123" にならないバグを修正. + bug report ありがとう新井康司さん. + +Tue Aug 5 18:28:32 1997 WATANABE Hirofumi + + * jperl5.004_01_02.pat: + + * perl5.004_01_02 対応. + +Thu Jul 10 19:38:40 1997 WATANABE Hirofumi + + * jperl5.004_01-970710.pat: + + * kanji.c で strcasecmp() を使うのをやめて jstricmp() を使うよう + にした. + + * no I18N::Japanese; で $^H &= ~0xf000 になるようにした. + qw(string) も独立させた. これで jcode.pl の頭に + no I18N::Japanese; を入れれば jperl で使えるようになる. + + * win32/Makefile に kanji.[co] を追加. + +Tue Jun 17 13:22:23 1997 WATANABE Hirofumi + + * jperl5.004_01-970617.pat: + + * perl5.004_01 対応. + +Mon May 26 11:01:49 1997 WATANABE Hirofumi + + * jperl5.004-970526.pat: + + * jcode.pl 対応. 文字クラス中の 16/8 進リテラルは 1 byte 単位で + アクセスすることした. いままでは 1 byte でも日本語 1 文字でもな + いので, 全く機能してなかった. + + * tr も同じ. + +Sun May 18 17:49:11 1997 WATANABE Hirofumi + + * jperl5.004-970518.pat: + + * perl5.004 Release 対応. + +Fri May 16 11:26:35 1997 WATANABE Hirofumi + + * jperl5.004RC2-970516.pat: + + * perl5.004 Release Candidate 2 対応. + +Mon May 12 09:55:02 1997 WATANABE Hirofumi + + * jperl5.003_99a-970512.pat: + + * perl5.003_99a 対応. + +Wed May 7 12:40:05 1997 WATANABE Hirofumi + + * jperl5.003_99-970507.pat: + + * bit vector を 256+4096 に縮小. + + * bit vector へのポインタの alignment を 4 バイト境界にした. + +Tue May 6 12:15:45 1997 WATANABE Hirofumi + + * regclass(), reginclass() を bit vector(8192 bytes)にして高速化. + +Sun May 4 17:50:14 1997 WATANABE Hirofumi + + * jperl5.003_99-970504.pat: + + * \Q, quotemeta() が日本語対応してなかった. + +Sat May 3 22:29:35 1997 WATANABE Hirofumi + + * perl.5003_99 対応. + +Thu May 1 19:29:21 1997 WATANABE Hirofumi + + * jperl5.003_98-970501.pat: + + * perl5.003_98 対応. + +Sun Apr 27 22:18:08 1997 WATANABE Hirofumi + + * jperl5.003_97i-970427.pat: + + * regdump() の結果がめちゃくちゃになるので ANYOF_ENDMARK は 99 に + した. ついでに ANYOF で範囲を表示するようにした. + + * KANJI MODE じゃないとき, ANY, SANY, ANYOF は SIMPLE になるように + した. + +Fri Apr 25 12:12:09 1997 WATANABE Hirofumi + + * jperl5.003_97h-970425.pat: + + * patch 2.2 の -p の仕様が変更になったので, man + patch で推奨されてる形式にした. + +Wed Apr 23 12:25:16 1997 WATANABE Hirofumi + + * jperl5.003_97g-970423.pat: + + * regexp.h: ROPT_KANJI と ROPT_IMPLICIT が重なってた. + + * regcomp.h: ANY, SANY, ANYOF を simple[] へ移動. [] + が速くなった(はず). + +Wed Apr 21 12:23:56 1997 WATANABE Hirofumi + + * jperl5.003_97g-970421.pat: + + * perl5.003_97g 対応. + +-------------- + +jperl5.003_97e-970417.pat: + + * perl5.003_97e 対応. + +jperl5.003_97d-970415.pat: + + * perl5.003_97d 対応. + +jperl5.003_95-970331.pat: + + * perl5.003_93, 94, 95 対応. + + * /[\W]/, /[\S]/ がまったく逆の意味になっていたので修正. + 誰も使ってないのかな? わたしも使ったことがない. :-) + make test で発覚. + regcomp.c は大幅に書き換えた. + + * #! のラインに -Lsjis とかあると駄目だったので strncasecmp + を使おうと思ったけど util.c の ibcmp() を使った. + + * -w option をつけると tr/// が "Use of uninitialized value" + という warning になっていたので修正. t/lib/db-*.t になぜか + -w がついてて鬱陶しくなった. + + * toke.c の無意味なインデントの変更はやめた. + +jperl5.003_05.patch7_1: + + * jperl4.036 とマージした. diff -urN perl5.004_02/Configure jperl5.004_02/Configure --- perl5.004_02/Configure Thu Aug 7 23:08:44 1997 +++ jperl5.004_02/Configure Fri Aug 8 10:07:23 1997 @@ -630,6 +630,7 @@ usrinc='' defvoidused='' voidflags='' +d_euc='' CONFIG='' define='define' @@ -2490,6 +2491,19 @@ test 0 -eq "$subversion" || $echo $n " subversion $subversion" $c echo ".)" +if $test -r ../kanji.h; then + echo "You have Japanized version."; + dflt=y + rp="Do you want to use EUC as the kanji code(y=EUC, n=SJIS)?" + . ./myread + case "$ans" in + [yY]*) + d_euc='define';; + *) + d_euc='undef';; + esac +fi + : set the prefixup variable, to restore leading tilda escape prefixup='case "$prefixexp" in "$prefix") ;; @@ -10468,6 +10482,7 @@ xlibpth='$xlibpth' zcat='$zcat' zip='$zip' +d_euc='$d_euc' EOT : add special variables diff -urN perl5.004_02/MANIFEST jperl5.004_02/MANIFEST --- perl5.004_02/MANIFEST Tue Jul 29 09:01:17 1997 +++ jperl5.004_02/MANIFEST Fri Aug 8 10:07:23 1997 @@ -303,6 +303,8 @@ installman Perl script to install man pages for pods installperl Perl script to do "make install" dirty work interp.sym Interpreter specific symbols to hide in a struct +kanji.c Kanji handling routines. +kanji.h Header file for the above. keywords.h The keyword numbers keywords.pl Program to write keywords.h lib/AnyDBM_File.pm Perl module to emulate dbmopen @@ -356,6 +358,7 @@ lib/Getopt/Long.pm Fetch command options (GetOptions) lib/Getopt/Std.pm Fetch command options (getopt, getopts) lib/I18N/Collate.pm Routines to do strxfrm-based collation +lib/I18N/Japanese.pm Pragma to control Japanese. lib/IPC/Open2.pm Open a two-ended pipe lib/IPC/Open3.pm Open a three-ended pipe! lib/Math/BigFloat.pm An arbitrary precision floating-point arithmetic package @@ -535,6 +538,7 @@ pod/Makefile Make pods into something else pod/buildtoc generate perltoc.pod pod/checkpods.PL Tool to check for common errors in pods +pod/jperl.pod Jperl man page pod/perl.pod Top level perl man page pod/perlapio.pod IO API info pod/perlbook.pod Book info diff -urN perl5.004_02/Makefile.SH jperl5.004_02/Makefile.SH --- perl5.004_02/Makefile.SH Thu Aug 7 21:10:53 1997 +++ jperl5.004_02/Makefile.SH Fri Aug 8 10:07:23 1997 @@ -194,7 +194,7 @@ obj1 = $(mallocobj) gv$(OBJ_EXT) toke$(OBJ_EXT) perly$(OBJ_EXT) op$(OBJ_EXT) regcomp$(OBJ_EXT) dump$(OBJ_EXT) util$(OBJ_EXT) mg$(OBJ_EXT) obj2 = hv$(OBJ_EXT) av$(OBJ_EXT) run$(OBJ_EXT) pp_hot$(OBJ_EXT) sv$(OBJ_EXT) pp$(OBJ_EXT) scope$(OBJ_EXT) pp_ctl$(OBJ_EXT) pp_sys$(OBJ_EXT) -obj3 = doop$(OBJ_EXT) doio$(OBJ_EXT) regexec$(OBJ_EXT) taint$(OBJ_EXT) deb$(OBJ_EXT) universal$(OBJ_EXT) globals$(OBJ_EXT) perlio$(OBJ_EXT) +obj3 = doop$(OBJ_EXT) doio$(OBJ_EXT) regexec$(OBJ_EXT) taint$(OBJ_EXT) deb$(OBJ_EXT) universal$(OBJ_EXT) globals$(OBJ_EXT) perlio$(OBJ_EXT) kanji$(OBJ_EXT) obj = $(obj1) $(obj2) $(obj3) $(ARCHOBJS) diff -urN perl5.004_02/README.j jperl5.004_02/README.j --- perl5.004_02/README.j Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/README.j Fri Aug 8 10:07:23 1997 @@ -0,0 +1,52 @@ +perl5.003に対する日本語パッチです。 +jperl4.019をperl5に移植した形になっています。 + +* 正規表現 +* chop +* split +* tr +* フォーマット +* open ( 漢字2文字目の|を認識) + +が日本語化されています。逆に日本語化されていないのは, + +* reverse +* index, rindex + +スイッチとして-bが追加されています。-bを付けると,日本語機能が +offになります。 + +新しいモジュールとして, +I18N::Japaneseが追加されています。 + +use I18N::Japanese qw(re tr format); + + あるいは + +no I18N::Japanese qw(re tr); + +のように使います。 + +起動ファイル名がjperlの場合には, +自動的に use I18N::Japanese qw(re tr format string); +が起動時に行われます。 + +バグ : use I18N::Japanese qw(format)指定はブロックスコープを持ちません。 +要するに,プログラムテキスト中,最後に行ったformat指定がプログラム全体を +通して有効になります。 + +日本語コードは一応EUCとMS漢字をサポート。 +MS漢字はちょっとあやしい。 + +Wed Jan 31 12:11:49 1996 +斎藤靖 +yasushi@cs.washington.edu +http://cs.washington.edu/homes/yasushi + + +Tue Aug 20 18:15:00 1996 Yasushi Saito + + * ^.が日本語にマッチしないバグ修正。渡辺博文@ソニーさん。 + 5.003対応。 + + diff -urN perl5.004_02/README.jp jperl5.004_02/README.jp --- perl5.004_02/README.jp Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/README.jp Fri Aug 8 10:08:06 1997 @@ -0,0 +1,61 @@ +1. はじめに + + perl5.004_02 に対する日本語パッチです. + jperl5.003.patch7 と jperl 4.036 をミックスした形で + 移植してます. + + 斎藤さんの変更は README.j をお読みください. + + t/kanji/*.t は chmod +x し. sjis で make するときは + t/kanji/* は sjis に変換しといてください. + +2. warning message について + + |perl: warning: Setting locale failed. + |perl: warning: Please check that your locale settings: + | LC_ALL = (unset), + | LANG = "ja_JP.ujis" + | are supported and installed on your system. + |perl: warning: Falling back to the standard locale ("C"). + + という warning message は環境変数 PERL_BADLANG を設定する + ことで抑制することができます. + csh 系: setenv PERL_BADLANG 0 + sh 系: PERL_BADLANG=0; export PERL_BADLANG + もしくは ./Configure -U d_setlocale してから make しましょう. + +3. jcode.pl を動かす + + jcode.pl の先頭に no I18N::Japanese; を入れます. + 後は script の中で適宜 use, no を使い分けてください. + 基本的に jcode 内の subroutine を call する前に + no I18N::Japanese; して jperl 特有の機能を使う前に + use I18N::Japanese; します. + + +4. jperl4.036 からの機能. + + 文字種のオプション + + -Lsjis 日本語 Shift-JIS + -Leuc 日本語 EUC + -Ltca 中国語(台湾) TCA + -Lkseuc 韓国語EUC(KS C5601-1987) + -Llatin 1バイト文字のみ + + の 5 通りに切り替わります.jperl という名前で起動された場合 + はコンパイル時に指定した文字種になります(でも Configure では + euc か sjis しか選べない. もうしわけない). + + jperl5.003.patch7 では full path で表わしたときにディレクト + リに jperl という文字が含まれていたら日本語機能が ON になっ + ていたので basename に jperl が含まれているときだけ ON にな + るようにしました(そうしないと jperl という名前のディレクトリ + で make するとデバッグできん :-) これは主に DOS での話). つ + いでに sjisperl, eucperl なんて名前で起動されればその文字種 + になるようにしました. + +Fri Aug 8 10:08:04 JST 1997 +渡辺博文 +watanabe@ase.ptg.sony.so.jp +$CPAN/authors/id/WATANABE diff -urN perl5.004_02/config_h.SH jperl5.004_02/config_h.SH --- perl5.004_02/config_h.SH Fri May 9 01:57:47 1997 +++ jperl5.004_02/config_h.SH Fri Aug 8 10:07:23 1997 @@ -123,6 +123,13 @@ #define CPPSTDIN "$cppstdin" #define CPPMINUS "$cppminus" +/* EUC + * It defined, the perl handles Japanese EUC characters. + * Otherwise, it handles Shift JIS(aka MS Kanji) characters. + */ +#$d_euc EUC /**/ + + /* HAS_ALARM: * This symbol, if defined, indicates that the alarm routine is * available. diff -urN perl5.004_02/doio.c jperl5.004_02/doio.c --- perl5.004_02/doio.c Tue Jul 29 10:08:11 1997 +++ jperl5.004_02/doio.c Fri Aug 8 10:07:23 1997 @@ -252,7 +252,8 @@ else fp = PerlIO_open(name,mode); } - else if (name[len-1] == '|') { + else if (name[len-1] == '|' + && kpart(name, name+len-1) == KPART_OTHER) { name[--len] = '\0'; while (len && isSPACE(name[len-1])) name[--len] = '\0'; diff -urN perl5.004_02/doop.c jperl5.004_02/doop.c --- perl5.004_02/doop.c Thu Apr 24 00:22:20 1997 +++ jperl5.004_02/doop.c Fri Aug 8 10:07:23 1997 @@ -18,23 +18,55 @@ #include #endif +/* Cat 1 or 2-byte char into SV */ +static I32 +sv_catkanji(sv, tch) +SV *sv; +U32 tch; +{ + char xx; + /* horendously inefficent. I'll fix it later. hold on + -yasushi */ + if (tch > 255) { + xx = tch >> 8; + sv_catpvn(sv, &xx, 1); + xx = tch; + sv_catpvn(sv, &xx, 1); + return 2; + } else { + xx = tch; + sv_catpvn(sv, &xx, 1); + return 1; + } +} + + I32 do_trans(sv,arg) SV *sv; OP *arg; { - register short *tbl; - register U8 *s; - register U8 *send; - register U8 *d; - register I32 ch; + register U16 *tbl; + register U8 *s, *next_s; register I32 matches = 0; + register I32 ch; + register U8 *send; + STRLEN dlen = 0; + U32 last_rch; + SV *dest_sv = sv_newmortal(); + register I32 squash = op->op_private & OPpTRANS_SQUASH; + I32 delete = op->op_private & OPpTRANS_DELETE; + I32 complement = op->op_private & OPpTRANS_COMPLEMENT; + I32 kanji = op->op_private & OPpTRANS_KANJI; + STRLEN len; + sv_setpvn(dest_sv, "", 0); + if (SvREADONLY(sv)) croak(no_modify); - tbl = (short*)cPVOP->op_pv; + tbl = (U16 *)cPVOP->op_pv; s = (U8*)SvPV(sv, len); if (!len) return 0; @@ -45,37 +77,61 @@ if (!tbl || !s) croak("panic: do_trans"); DEBUG_t( deb("2.TBL\n")); - if (!op->op_private) { - while (s < send) { - if ((ch = tbl[*s]) >= 0) { - matches++; - *s = ch; + + while (s < send) { + U32 tch, rch; + U8 *nest_s; + U16 *tp; + int matched; + + if (kanji && iskanji(*s) && s < send-1) { + tch = twochar_to_int(*s, *(s+1)); + next_s = s+2; + } else { + tch = *(U8*)s; + next_s = s+1; + } + + /* look for ch in tbl */ + if (!complement) { + for (tp = tbl; *tp != (U16)(-1); tp += 2) { + if (*tp == tch) break; + } + matched = (*tp != (U16)(-1)); + rch = tp[1]; + } else { + for (tp = tbl; *tp != (U16)(-1); tp += 2) { + if (*tp == tch) break; } - s++; + matched = (*tp == (U16)(-1)); + if (matched && tbl != tp && tp[-1] != (U16)-1) + tch = tp[-1]; + rch = (U16)(delete ? -2 : -1); } - } - else { - d = s; - while (s < send) { - if ((ch = tbl[*s]) >= 0) { - *d = ch; - if (matches++ && squash) { - if (d[-1] == *d) - matches--; - else - d++; + + + if (!matched) { + dlen += sv_catkanji(dest_sv, tch); + } else { + if (rch == (U16)(-2)) { + /* delete this character */; + } else if (squash) { + if (last_rch == (rch==(U16)(-1)?tch:rch)) { + ; + } else { + dlen += sv_catkanji(dest_sv, rch == (U16)(-1) ? tch : rch); + matches++; } - else - d++; + } else { + dlen += sv_catkanji(dest_sv, rch==(U16)(-1) ? tch : rch); + matches++; } - else if (ch == -1) /* -1 is unmapped character */ - *d++ = *s; /* -2 is delete character */ - s++; - } - matches += send - d; /* account for disappeared chars */ - *d = '\0'; - SvCUR_set(sv, d - (U8*)SvPVX(sv)); + } + last_rch = ((rch==(U16)(-1)||rch==(U16)(-2)) ? tch : rch); + s = next_s; } + matches += (s-(U8*)SvPVX(sv)) - dlen; /* account for disappeared chars */ + sv_setpvn(sv, SvPVX(dest_sv), dlen); SvSETMAGIC(sv); return matches; } @@ -241,8 +297,15 @@ if (len && !SvPOK(sv)) s = SvPV_force(sv, len); if (s && len) { - s += --len; - sv_setpvn(astr, s, 1); + if (kpart(s, s+len-1) == KPART_KANJI_2) { + len -= 2; + s += len; + sv_setpvn(astr, s, 2); + } else { + len--; + s += len; + sv_setpvn(astr, s, 1); + } *s = '\0'; SvCUR_set(sv, len); SvNIOK_off(sv); diff -urN perl5.004_02/embed.h jperl5.004_02/embed.h --- perl5.004_02/embed.h Tue Jul 29 09:33:25 1997 +++ jperl5.004_02/embed.h Fri Aug 8 10:07:23 1997 @@ -1268,6 +1268,7 @@ #define mess_sv (curinterp->Imess_sv) #define minus_F (curinterp->Iminus_F) #define minus_a (curinterp->Iminus_a) +#define minus_b (curinterp->Iminus_b) #define minus_c (curinterp->Iminus_c) #define minus_l (curinterp->Iminus_l) #define minus_n (curinterp->Iminus_n) @@ -1422,6 +1423,7 @@ #define Imess_sv mess_sv #define Iminus_F minus_F #define Iminus_a minus_a +#define Iminus_b minus_b #define Iminus_c minus_c #define Iminus_l minus_l #define Iminus_n minus_n @@ -1585,6 +1587,7 @@ #define mess_sv Perl_mess_sv #define minus_F Perl_minus_F #define minus_a Perl_minus_a +#define minus_b Perl_minus_b #define minus_c Perl_minus_c #define minus_l Perl_minus_l #define minus_n Perl_minus_n diff -urN perl5.004_02/form.h jperl5.004_02/form.h --- perl5.004_02/form.h Fri Mar 7 00:46:34 1997 +++ jperl5.004_02/form.h Fri Aug 8 10:07:23 1997 @@ -23,4 +23,5 @@ #define FF_NEWLINE 13 #define FF_BLANK 14 #define FF_MORE 15 +#define FF_OPTION 16 /* c : hints value */ diff -urN perl5.004_02/jcode.pl-2.3.pat jperl5.004_02/jcode.pl-2.3.pat --- perl5.004_02/jcode.pl-2.3.pat Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/jcode.pl-2.3.pat Fri Aug 8 10:07:23 1997 @@ -0,0 +1,5 @@ +--- jcode.pl-2.3 Tue Feb 25 14:09:15 1997 ++++ jcode.pl Thu Jul 10 19:56:31 1997 +@@ -1 +1,2 @@ ++no I18N::Japanese; + package jcode; diff -urN perl5.004_02/kanji.c jperl5.004_02/kanji.c --- perl5.004_02/kanji.c Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/kanji.c Fri Aug 8 10:07:23 1997 @@ -0,0 +1,401 @@ +/*kanji.c: kanji handling routines + *$Id$ + */ + +#include "EXTERN.h" +#include "perl.h" + +unsigned char langtype_euc_tab[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 4x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 6x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, /* 8x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9x */ + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ax */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Bx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Cx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Dx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ex */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, /* Fx */ +}; +unsigned char langtype_sjis_tab[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 4x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 5x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 6x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, /* 7x */ + 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 8x */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 9x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Ax */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Bx */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Cx */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* Dx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ex */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, /* Fx */ +}; +unsigned char langtype_ks_euc_tab[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 4x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 6x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 8x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9x */ + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ax */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Bx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Cx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Dx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ex */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, /* Fx */ +}; +unsigned char langtype_tca_tab[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, /* 3x */ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 4x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, /* 5x */ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 6x */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, /* 7x */ + 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 8x */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 9x */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ax */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Bx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Cx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Dx */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* Ex */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, /* Fx */ +}; +unsigned char langtype_latin_tab[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 4x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 6x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 8x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9x */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Ax */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Bx */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Cx */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Dx */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Ex */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Fx */ +}; +struct _langtypetable langtypetable[]= +{ + {langtype_sjis_tab, "SJIS"}, + {langtype_euc_tab, "EUC"}, + {langtype_ks_euc_tab, "KSEUC"}, + {langtype_tca_tab, "TCA"}, + {langtype_latin_tab, "LATIN"}, + {0, 0}, +}; +#ifdef EUC +char *langtypesymbol = "EUC"; +unsigned char *langtype = langtype_euc_tab; +#else +#ifdef KSEUC +char *langtypesymbol = "KSEUC"; +unsigned char *langtype = langtype_kseuc_tab; +#else +#ifdef TCA +char *langtypesymbol = "TCA"; +unsigned char *langtype = langtype_tca_tab; +#else +#ifdef LATIN +char *langtypesymbol = "LATIN"; +unsigned char *langtype = langtype_latin_tab; +#else +char *langtypesymbol = "SJIS"; +unsigned char *langtype = langtype_sjis_tab; +#endif +#endif +#endif +#endif + +#define IsLower(c) ('a' <= (unsigned char)(c) && (unsigned char)(c) <= 'z') +#define IsUpper(c) ('A' <= (unsigned char)(c) && (unsigned char)(c) <= 'Z') +#define ToLower(c) ((c) - 'A' + 'a') +#define ToUpper(c) ((c) - 'a' + 'A') + +int +set_lang_type(symbol) +char *symbol; +{ + int i; + + if (!symbol || !*symbol) + return 0; + for (i = 0; langtypetable[i].symbol; i++) { + if (jstricmp(langtypetable[i].symbol, symbol) == 0) { + langtype = langtypetable[i].table; + langtypesymbol = langtypetable[i].symbol; + return 0; + } + } + return -1; +} + +/* + Combine two chars C1, C2 into single 16-bit integer + */ +I32 twochar_to_int(c1, c2) +int c1, c2; +{ + return (((unsigned char)c1)<<8 | ((unsigned char)c2)); +} +I32 int_to_1stbyte(ch) +I32 ch; +{ + return (ch >> 8); +} +I32 int_to_2ndbyte(ch) +I32 ch; +{ + return ch & 0xFF; +} + +/* kpart(pLim, pChr); + * char * pLim; Buffer Top or Limit for scanning + * char * pChr; Pointer to the char + * return whith 1 : *pChr is First byte of MS-Kanji + * return whith 2 : *pChr is Second byte of MS-Kanji + * return whith 0 : otherwize + */ +int +kpart(pLim, pChr) +char *pLim; +char *pChr; +{ + register char *p = pChr - 1; + register int ct = 0; + + while (p >= pLim && iskanji(*p)) { + p--; + ct++; + } + return (ct & 1) ? 2 : iskanji(*pChr); +} + +/* jstrlen: return the number of charctors in string. + */ +int +jstrlen(s) +char *s; +{ + int len; + + for (len = 0; *s; s++, len++) { + if (iskanji(*s) && s[1]) + s++; + } + return len; +} +/* jstlen: return the number of charctors in string. + */ +int +jstlen(s, len) +char *s; +int len; +{ + int l; + char *send = s + len; + + for (l = 0; s < send; s++, l++) { + if (iskanji(*s) && s[1]) + s++; + } + return l; +} + +/* jnthchar: return with the pointer to n'th charactor in string. + * return (char *)0, when jstrlen(s) < n + */ +char * +jnthchar(s, n) +char *s; +int n; +{ + if (n) { + while (--n) { + if (iskanji(*s)) + s++; + s++; + } + } + return s; +} + +/* jindex: return with the pointer to 'ch' in "string" + * return with (char *)0, when not found + */ +char * +jindex(s, c) +char *s; +int c; +{ + while (*s) { + if (UCH(*s) == UCH(c)) + return s; + + if (iskanji(*s) && s[1]) { + s++; + } + s++; + } + return (char *) 0; +} + +/* jrindex: return with the pointer to Right end 'ch' in "string" + * return with (char *)0, when not found + */ +char * +jrindex(s, c) +char *s; +int c; +{ + char *olds = (char *) 0; + + while (*s) { + if (UCH(*s) == UCH(c)) + olds = s; + + if (iskanji(*s) && s[1]) { + s++; + } + s++; + } + return olds; +} + +char * +jstrlower(s) +char *s; +{ + char *ws = s; + + while (*ws) { + if (iskanji(*ws) && ws[1]) + ws++; + else + *ws = IsUpper(*ws) ? ToLower(*ws) : *ws; + ws++; + } + return s; +} + +int +jstricmp(s1, s2) +char *s1; +char *s2; +{ + int c1, c2; + + for (;;) { + c1 = UCH(*s1++); + c2 = UCH(*s2++); + if (!iskanji(c1)) { + c1 = IsUpper(c1) ? ToLower(c1) : c1; + c2 = IsUpper(c2) ? ToLower(c2) : c2; + } + if (c1 != c2) + return c1 - c2; + if (c1 == 0) + return 0; + if (iskanji(c1)) { + c1 = UCH(*s1++); + c2 = UCH(*s2++); + if (c1 != c2) + return c1 - c2; + if (c1 == 0) + return 0; + } + } + return 0; +} + +int +jstrnicmp(s1, s2, n) +char *s1; +char *s2; +int n; +{ + int c1, c2; + + for (;;) { + if (n--) + return 0; + c1 = UCH(*s1++); + c2 = UCH(*s2++); + if (!iskanji(c1)) { + c1 = IsUpper(c1) ? ToLower(c1) : c1; + c2 = IsUpper(c2) ? ToLower(c2) : c2; + } + if (c1 != c2) + break; + if (c1 == 0) + return 0; + if (iskanji(c1)) { + if (n--) + return 0; + c1 = UCH(*s1++); + c2 = UCH(*s2++); + if (c1 != c2) + break; + if (c1 == 0) + return 0; + } + } + return c1 - c2; +} + +unsigned int +jnextcode(cc) +unsigned int cc; +{ + unsigned int c1, c2; + + if (langtype == langtype_latin_tab) { + return cc + 1; /* ?:-P */ + } + c1 = ((cc >> 8) & 255); + c2 = (cc & 255); + c2++; + if (iskanji2(c2)) { + return (c1 << 8)|c2; + } + while (!iskanji2(c2)) { + c2++; + } + if (c2 <= 0x100) { + return (c1 << 8)|c2; + } + c2 &= 0xFF; + c1++; + if (iskanji(c1)) { + return (c1 << 8)|c2; + } + while (!iskanji(c1)) { + c1++; + } + return ((c1 & 255) << 8)|c2; +} diff -urN perl5.004_02/kanji.h jperl5.004_02/kanji.h --- perl5.004_02/kanji.h Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/kanji.h Fri Aug 8 10:07:23 1997 @@ -0,0 +1,65 @@ +#ifndef _KANJI_H_INCLUDE +#ifdef iskanji +#undef iskanji +#endif + +#define UCH(c) ((unsigned char)(c)) +#define BND(l, h, c) (((l) <= (c))&&((c) <= (h))) + +#define iskanji(c) (!minus_b && (langtype[(int)((unsigned char)(c))] & 1)) +#define iskanji2(c) (!minus_b && (langtype[(int)((unsigned char)(c))] & 2)) + +struct _langtypetable { + unsigned char *table; + char *symbol; +}; +extern char *langtypesymbol; +extern unsigned char *langtype; +extern struct _langtypetable langtypetable[]; + +/* return values of kpart */ +#define KPART_KANJI_1 1 /* kanji 1st byte */ +#define KPART_KANJI_2 2 /* kanji 2nd byte */ +#define KPART_OTHER 0 /* other (ASCII) */ + +I32 twochar_to_int _((int c1, int c2)); +I32 int_to_1stbyte _((I32 c1)); +I32 int_to_2ndbyte _((I32 c1)); + +#if defined(__STDC__) || defined(MSDOS) +int set_lang_type(char *symbol); +int kpart(char *pLim,char *pChr); +int jstrlen(char *s); +int jstlen(char *s, int len); +char *jnthchar(char *s,int n); +char *jindex(char *s,int c); +char *jrindex(char *s,int c); +char *jstrlower(char *s); +int jstricmp(char *s1, char *s2); +int jstrnicmp(char *s1, char *s2, int n); +#else +int set_lang_type(); +int kpart(); +int jstrlen(); +int jstlen(); +char *jnthchar(); +char *jindex(); +char *jrindex(); +char *jstrlower(); +int jstricmp(); +int jstrnicmp(); +#endif +#ifdef strlwr +#undef strlwr +#endif +#define strlwr jstrlower +#ifdef index +#undef index +#endif +#define index jindex +#ifdef rindex +#undef rindex +#endif +#define rindex jrindex +#define _KANJI_H_INCLUDE 1 +#endif /* _KANJI_H_INCLUDE */ diff -urN perl5.004_02/lib/I18N/Japanese.pm jperl5.004_02/lib/I18N/Japanese.pm --- perl5.004_02/lib/I18N/Japanese.pm Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/lib/I18N/Japanese.pm Fri Aug 8 10:07:23 1997 @@ -0,0 +1,71 @@ +package I18N::Japanese; + +=head1 NAME + +Japanese - Perl pragma to control whether some built-in operations understand + composite characters or not. + +=head1 SYNOPSIS + +use Japanese qw(re tr format); +nouse Japanese qw(re tr format); + +=head1 DESCRIPTION + +This module is actually a pragma module, though it begins with capital J. +Import/unimport takes list of strings. + +=over 5 + +=re + + Enables Japanized regexp, including split. + +=tr + + Enables Japanized tr//. + +=format + + Enables Japanized format. + +=back + + As usual, omitting the arguments means specifying all the bits. + The setting is valid only within the current block. + +=cut + +# ウラワザ : 0x8000 は,文字列定数で2バイト文字を認識するかどうか +sub import { + shift; + my $bits = 0; + @_ = qw(re tr format string) if (@_ == 0); + for (@_) { + $bits |= 0x1000 if $_ eq 're'; + $bits |= 0x2000 if $_ eq 'tr'; + $bits |= 0x4000 if $_ eq 'format'; + $bits |= 0x8000 if $_ eq 'string'; + } +# $bits |= 0x8000 if ($bits != 0); + + $^H |= $bits; + #print "imp : hints = $bits\n"; +} + +sub unimport { + shift; + my $bits = 0; + @_ = qw(re tr format string) if (@_ == 0); + for (@_) { + $bits |= 0x1000 if $_ eq 're'; + $bits |= 0x2000 if $_ eq 'tr'; + $bits |= 0x4000 if $_ eq 'format'; + $bits |= 0x8000 if $_ eq 'string'; + } +# $bits = 0x8000 if ($^H != 0x8000 && $bits == 0); + $^H &= ~$bits; + #print "unimp : hints = $bits\n"; +} + +1; diff -urN perl5.004_02/op.c jperl5.004_02/op.c --- perl5.004_02/op.c Tue Aug 5 22:26:00 1997 +++ jperl5.004_02/op.c Fri Aug 8 10:07:23 1997 @@ -1927,57 +1927,74 @@ SV *rstr = ((SVOP*)repl)->op_sv; STRLEN tlen; STRLEN rlen; - register U8 *t = (U8*)SvPV(tstr, tlen); - register U8 *r = (U8*)SvPV(rstr, rlen); - register I32 i; - register I32 j; + register char *t = SvPV(tstr, tlen); + register char *r = SvPV(rstr, rlen); + register I32 i; /* indexes t */ + register I32 j; /* indexes j */ + register I32 k; /* indexes tbl */ + I32 lasttch = -3; + I32 lastrch = -1; + I32 tbl_size = 256; + I32 delete; I32 complement; - register short *tbl; + I32 kanji; + /* the even index holds the t-char(in 2byte), and the odd index + holds the r-char(in 2 byte) if t-char is to be removed, then + r-char is -2. */ + register U16 *tbl; - tbl = (short*)cPVOP->op_pv; + New(803, tbl, tbl_size, U16); + + complement = op->op_private & OPpTRANS_COMPLEMENT; delete = op->op_private & OPpTRANS_DELETE; + kanji = op->op_private & OPpTRANS_KANJI; /* squash = op->op_private & OPpTRANS_SQUASH; */ - if (complement) { - Zero(tbl, 256, short); - for (i = 0; i < tlen; i++) - tbl[t[i]] = -1; - for (i = 0, j = 0; i < 256; i++) { - if (!tbl[i]) { - if (j >= rlen) { - if (delete) - tbl[i] = -2; - else if (rlen) - tbl[i] = r[j-1]; - else - tbl[i] = i; - } - else - tbl[i] = r[j++]; + for (i = 0, j = 0, k = 0; i < tlen || j < rlen; ) { + U32 tch, rch; + if (i >= tlen) { + tch = lasttch; + } else { + if (kanji && iskanji(t[i]) && i < tlen-1) { + tch = twochar_to_int(t[i], t[i+1]); + i+=2; + } else { + tch = (unsigned char)t[i]; + i++; } + lasttch = tch; } - } - else { - if (!rlen && !delete) { - r = t; rlen = tlen; - } - for (i = 0; i < 256; i++) - tbl[i] = -1; - for (i = 0, j = 0; i < tlen; i++,j++) { - if (j >= rlen) { - if (delete) { - if (tbl[t[i]] == -1) - tbl[t[i]] = -2; - continue; - } - --j; + if (j >= rlen) { + if (delete) rch = -2; + else rch = lastrch; + } else { + if (kanji && iskanji(r[j]) && j < rlen-1) { + rch = twochar_to_int(r[j], r[j+1]); + j += 2; + } else { + rch = (unsigned char)r[j]; + j++; } - if (tbl[t[i]] == -1) - tbl[t[i]] = r[j]; + lastrch = rch; } + if (k >= tbl_size) { + tbl_size += 256; + Renew(tbl, tbl_size, U16); + } + tbl[k++] = tch; + tbl[k++] = rch; + } + if (k >= tbl_size) { + tbl_size += 4; + Renew(tbl, tbl_size, U16); } + /* mark the end */ + tbl[k++] = -1; + tbl[k++] = -1; + cPVOP->op_pv = (char*)tbl; + op_free(expr); op_free(repl); diff -urN perl5.004_02/op.h jperl5.004_02/op.h --- perl5.004_02/op.h Wed May 14 03:59:40 1997 +++ jperl5.004_02/op.h Fri Aug 8 10:07:23 1997 @@ -96,6 +96,7 @@ #define OPpTRANS_SQUASH 16 #define OPpTRANS_DELETE 32 #define OPpTRANS_COMPLEMENT 64 +#define OPpTRANS_KANJI 128 /* Private for OP_REPEAT */ #define OPpREPEAT_DOLIST 64 /* List replication. */ diff -urN perl5.004_02/perl.c jperl5.004_02/perl.c --- perl5.004_02/perl.c Tue Aug 5 22:38:36 1997 +++ jperl5.004_02/perl.c Fri Aug 8 10:07:42 1997 @@ -453,6 +453,8 @@ Safefree(sv_interp); } +int default_hints; + int perl_parse(sv_interp, xsinit, argc, argv, env) PerlInterpreter *sv_interp; @@ -506,6 +508,29 @@ return 0; } + s = rindex(origargv[0], '/'); + if (!s) + s = origargv[0]; + if (instr(s, "jperl")) + s = ""; + else if (instr(s, "kseucperl")) + s = "KSEUC"; + else if (instr(s, "tcaperl")) + s = "TCA"; + else if (instr(s, "eucperl")) + s = "EUC"; + else if (instr(s, "sjisperl")) + s = "SJIS"; + else if (instr(s, "latinperl")) + s = "LATIN"; + else + s = 0; + if (s) { + default_hints = hints = HINT_KANJI_REGEXP | HINT_KANJI_TR + | HINT_KANJI_FORMAT | HINT_KANJI_STRING; + set_lang_type(s); + } + if (main_root) { curpad = AvARRAY(comppad); op_free(main_root); @@ -559,6 +584,7 @@ case '0': case 'F': case 'a': + case 'b': case 'c': case 'd': case 'D': @@ -574,6 +600,7 @@ case 'U': case 'v': case 'w': + case 'L': if (s = moreswitches(s)) goto reswitch; break; @@ -1345,6 +1372,14 @@ minus_a = TRUE; s++; return s; + case 'b': + /* No kanji support */ + minus_b = TRUE; + default_hints = hints &= ~(HINT_KANJI_REGEXP | HINT_KANJI_TR + | HINT_KANJI_FORMAT | HINT_KANJI_STRING); + set_lang_type("LATIN"); + s++; + return s; case 'c': minus_c = TRUE; s++; @@ -1497,6 +1532,11 @@ #endif printf("\n\nCopyright 1987-1997, Larry Wall\n"); + printf("\nJapanization patch 4 by Yasushi Saito, 1996\n"); + printf("\nModified by Hirofumi Watanabe, 1996, 1997\n"); + printf("jperl5.004_02-970808\n"); + printf("%s version\n", langtypesymbol); + #ifdef MSDOS printf("\n\nMS-DOS port Copyright (c) 1989, 1990, Diomidis Spinellis\n"); #endif @@ -1518,6 +1558,32 @@ dowarn = TRUE; s++; return s; + case 'L': + { + int f, i; + int len; + char *lang; + s++; + for (f = i = 0; langtypetable[i].symbol; i++) { + len = strlen(langtypetable[i].symbol); + if (ibcmp(s, langtypetable[i].symbol, len) == 0) { + set_lang_type(s); + f = 1; + default_hints = hints = + HINT_KANJI_REGEXP | + HINT_KANJI_TR | + HINT_KANJI_FORMAT | + HINT_KANJI_STRING; + break; + } + } + if (!f) + croak("Unknown language type '%s'", s); + minus_b = FALSE; + while (*s) + s++; + return s; + } case '*': case ' ': if (s[1] == '-') /* Additional switches on #! line. */ diff -urN perl5.004_02/perl.h jperl5.004_02/perl.h --- perl5.004_02/perl.h Tue Aug 5 23:51:17 1997 +++ jperl5.004_02/perl.h Fri Aug 8 10:07:23 1997 @@ -1661,6 +1661,11 @@ #define HINT_STRICT_VARS 0x00000400 #define HINT_LOCALE 0x00000800 +#define HINT_KANJI_REGEXP 0x00001000 +#define HINT_KANJI_TR 0x00002000 +#define HINT_KANJI_FORMAT 0x00004000 +#define HINT_KANJI_STRING 0x00008000 + /**************************************************************************/ /* This regexp stuff is global since it always happens within 1 expr eval */ /**************************************************************************/ @@ -1728,6 +1733,7 @@ IEXT bool Iminus_p; IEXT bool Iminus_l; IEXT bool Iminus_a; +IEXT bool Iminus_b; IEXT bool Iminus_F; IEXT bool Idoswitches; IEXT bool Idowarn; @@ -2027,6 +2033,21 @@ EXT MGVTBL vtbl_amagic; EXT MGVTBL vtbl_amagicelem; #endif /* OVERLOAD */ + +#include "kanji.h" +#undef isalpha +#undef isascii +#undef isdigit +#undef islower +#undef isspace +#undef isupper +#define isalpha(c) (BND('A', 'Z', UCH(c))||BND('a', 'z', UCH(c))) +#define isascii(c) (UCH(c) <= 0x7F) +#define isdigit(c) BND('0', '9', UCH(c)) +#define islower(c) BND('a', 'z', UCH(c)) +#define isspace(c) (BND(9, 13, UCH(c)) || (c) == ' ') +#define isupper(c) BND('A', 'Z', UCH(c)) + #endif /* !DOINIT */ diff -urN perl5.004_02/pod/jperl.pod jperl5.004_02/pod/jperl.pod --- perl5.004_02/pod/jperl.pod Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/pod/jperl.pod Fri Aug 8 10:07:23 1997 @@ -0,0 +1,114 @@ +=head1 NAME + +jperl - Japanized Perl. + +=head1 DESCRIPTION + +Jperl is a variation of L that recognizes 16-bit Japanese characters. +It supports Japanese EUC and Shift JIS(aka MS Kanji code). + +Currently, it understands Japanese in the following circumstances. + +=over 5 + +=item * Regular Expression + + '.' matches either 1-byte ASCII character or 2-byte Japanese character. +Character class notation('[X-Y]', '[^X-Y]') recognizes a +Japanese letter as a character. + +=item * tr + +Similarly, tr recognizes a Japanese letter as a letter. + +=item * split + +Similar. + +=item * formats + +It doesn't cut 2-byte letter at the middle when linebreaking. + +=item * chop + +It understands 2-byte letters. + +=back + +=head1 ADDITIONAL COMMAND LINE ARGUMENT + +=over 5 + +=item B<-b> + +turns off Japanese mode. This is supported only in Japanized perl. +Without this option, most 8-bit characters are interpreted to be +composite character, and some programs that +use 8th bit for some control information(L is an example) won't +work. With B<-b>, jperl will be compatible with the original perl. + +=back + +=head1 AUTHOR + +Yasushi Saito + +yasushi@cs.washington.edu + +=head1 NAME + +jperl - Japanized Perl. + +=head1 DESCRIPTION + +Jperl is a variation of L that recognizes 16-bit Japanese characters. +It supports Japanese EUC and Shift JIS(aka MS Kanji code). + +Currently, it understands Japanese in the following circumstances. + +=over 5 + +=item * Regular Expression + + '.' matches either 1-byte ASCII character or 2-byte Japanese character. +Character class notation('[X-Y]', '[^X-Y]') recognizes a +Japanese letter as a character. + +=item * tr + +Similarly, tr recognizes a Japanese letter as a letter. + +=item * split + +Similar. + +=item * formats + +It doesn't cut 2-byte letter at the middle when linebreaking. + +=item * chop + +It understands 2-byte letters. + +=back + +=head1 ADDITIONAL COMMAND LINE ARGUMENT + +=over 5 + +=item B<-b> + +turns off Japanese mode. This is supported only in Japanized perl. +Without this option, most 8-bit characters are interpreted to be +composite character, and some programs that +use 8th bit for some control information(L is an example) won't +work. With B<-b>, jperl will be compatible with the original perl. + +=back + +=head1 AUTHOR + +Yasushi Saito + +yasushi@cs.washington.edu + diff -urN perl5.004_02/pod/perlrun.pod jperl5.004_02/pod/perlrun.pod --- perl5.004_02/pod/perlrun.pod Fri Aug 1 04:52:32 1997 +++ jperl5.004_02/pod/perlrun.pod Fri Aug 8 10:07:23 1997 @@ -207,6 +207,14 @@ An alternate delimiter may be specified using B<-F>. +=item B<-b> + +turns off Japanese mode. This is supported only in Japanized perl. +Without this option, most 8-bit characters are interpreted to be +composite character(i.e., Japanese Kanji), and some programs that +use 8th bit for some control information(L is an example) won't +work. With -b, jperl will be compatible with the original perl. + =item B<-c> causes Perl to check the syntax of the script and then exit without diff -urN perl5.004_02/pod/pod2man.PL jperl5.004_02/pod/pod2man.PL --- perl5.004_02/pod/pod2man.PL Fri Aug 1 06:03:43 1997 +++ jperl5.004_02/pod/pod2man.PL Fri Aug 8 10:07:23 1997 @@ -25,7 +25,7 @@ # You can use $Config{...} to use Configure variables. print OUT <<"!GROK!THIS!"; -$Config{startperl} +$Config{startperl} -b eval 'exec $Config{perlpath} -S \$0 \${1+"\$@"}' if \$running_under_some_shell; diff -urN perl5.004_02/pp.c jperl5.004_02/pp.c --- perl5.004_02/pp.c Tue Aug 5 22:07:43 1997 +++ jperl5.004_02/pp.c Fri Aug 8 10:07:23 1997 @@ -1996,11 +1996,17 @@ TAINT; SvTAINTED_on(sv); for (; s < send; s++) - *s = toUPPER_LC(*s); + if (hints & HINT_KANJI_STRING && iskanji(*s) && s+1 < send) + s++; + else + *s = toUPPER_LC(*s); } else { for (; s < send; s++) - *s = toUPPER(*s); + if (hints & HINT_KANJI_STRING && iskanji(*s) && s+1 < send) + s++; + else + *s = toUPPER(*s); } } RETURN; @@ -2028,11 +2034,17 @@ TAINT; SvTAINTED_on(sv); for (; s < send; s++) - *s = toLOWER_LC(*s); + if (hints & HINT_KANJI_STRING && iskanji(*s) && s+1 < send) + s++; + else + *s = toLOWER_LC(*s); } else { for (; s < send; s++) - *s = toLOWER(*s); + if (hints & HINT_KANJI_STRING && iskanji(*s) && s+1 < send) + s++; + else + *s = toLOWER(*s); } } RETURN; @@ -2051,7 +2063,10 @@ SvGROW(TARG, (len * 2) + 1); d = SvPVX(TARG); while (len--) { - if (!isALNUM(*s)) + if (hints & HINT_KANJI_STRING && iskanji(*s) && len) { + *d++ = *s++; + len--; + } else if (!isALNUM(*s)) *d++ = '\\'; *d++ = *s++; } diff -urN perl5.004_02/pp_ctl.c jperl5.004_02/pp_ctl.c --- perl5.004_02/pp_ctl.c Tue Aug 5 22:26:00 1997 +++ jperl5.004_02/pp_ctl.c Fri Aug 8 10:07:23 1997 @@ -243,6 +243,7 @@ I32 itemsize; I32 fieldsize; I32 lines = 0; + I32 kanji = (hints & HINT_KANJI_FORMAT); bool chopspace = (strchr(chopset, ' ') != Nullch); char *chophere; char *linemark; @@ -328,6 +329,11 @@ itemsize = len; if (itemsize > fieldsize) itemsize = fieldsize; + if (kanji + && kpart(item, item+itemsize) == KPART_KANJI_2) { + /* don't split kanji at the middle. */ + itemsize--; + } send = chophere = s + itemsize; while (s < send) { if (*s & ~31) @@ -355,6 +361,11 @@ } else { itemsize = fieldsize; + if (kanji + && kpart(item, item+itemsize) == KPART_KANJI_2) { + /* don't split kanji at the middle. */ + itemsize--; + } send = chophere = s + itemsize; while (s < send || (s == send && isSPACE(*s))) { if (isSPACE(*s)) { @@ -2231,6 +2242,7 @@ PP(pp_require) { + extern int default_hints; dSP; register CONTEXT *cx; SV *sv; @@ -2352,7 +2364,7 @@ name = savepv(name); SAVEFREEPV(name); SAVEI32(hints); - hints = 0; + hints = default_hints; /* switch to eval mode */ @@ -2603,6 +2615,7 @@ U16 *linepc; register I32 arg; bool ischop; + I32 kanji = (hints & HINT_KANJI_FORMAT); if (len == 0) croak("Null picture in formline"); @@ -2618,6 +2631,12 @@ } while (s <= send) { + if (kanji && iskanji(*s)) { + s += 2; + skipspaces = 0; + continue; + } + switch (*s++) { default: skipspaces = 0; diff -urN perl5.004_02/pp_sys.c jperl5.004_02/pp_sys.c --- perl5.004_02/pp_sys.c Thu Aug 7 23:42:07 1997 +++ jperl5.004_02/pp_sys.c Fri Aug 8 10:07:23 1997 @@ -2533,8 +2533,9 @@ if (!*s) { /* null never allowed in text */ odd += len; break; - } - else if (*s & 128) + } else if (iskanji(*s) && s[1]) { + ; + } else if (*s & 128) odd++; else if (*s < 32 && *s != '\n' && *s != '\r' && *s != '\b' && diff -urN perl5.004_02/regcomp.c jperl5.004_02/regcomp.c --- perl5.004_02/regcomp.c Fri Aug 1 03:18:44 1997 +++ jperl5.004_02/regcomp.c Fri Aug 8 10:07:23 1997 @@ -71,6 +71,8 @@ #define STATIC static #endif +#define ISKANJI(c) ((hints & HINT_KANJI_REGEXP) && iskanji(c)) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -111,6 +113,7 @@ static void regtail _((char *, char *)); static char* regwhite _((char *, char *)); static char* nextchar _((void)); +static void regclassfree _((struct regexp *)); /* - pregcomp - compile a regular expression into internal code @@ -200,7 +203,7 @@ /* Dig out information for optimizations. */ pm->op_pmflags = regflags; r->regstart = Nullsv; /* Worst-case defaults. */ - r->reganch = 0; + r->reganch = ((hints & HINT_KANJI_REGEXP) ? ROPT_KANJI : 0); r->regmust = Nullsv; r->regback = -1; r->regstclass = Nullch; @@ -793,12 +796,12 @@ else ret = regnode(ANY); regnaughty++; - *flagp |= HASWIDTH|SIMPLE; + *flagp |= HASWIDTH; break; case '[': regparse++; ret = regclass(); - *flagp |= HASWIDTH|SIMPLE; + *flagp |= HASWIDTH; break; case '(': nextchar(); @@ -949,7 +952,7 @@ : EXACT); regc(0); /* save spot for len */ for (len = 0, p = regparse - 1; - len < 127 && p < regxend; + len < 126 && p < regxend; len++) { oldp = p; @@ -1040,7 +1043,22 @@ } if (regflags & PMf_EXTENDED) p = regwhite(p, regxend); - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (ISKANJI(ender) && *p) { + if (ISMULT2((p+1))) { + if (len) { + p = oldp; + } else { + len += 2; + regc(ender); + ender = *p++; + regc(ender); + } + break; + } + len++; + regc(ender); + ender = *p++; + } else if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else { @@ -1090,14 +1108,47 @@ } static void +block_on(b, x, y) +char *b; +int x; +int y; +{ + int lo; + int hi; + int r_lo = x & 7; + int r_hi = y & 7; + + if (!b) + return; + + if (0x8000 <= x) + x -= 0x8000 - 0x100; + if (0x8000 <= y) + y -= 0x8000 - 0x100; + + lo = x >> 3; + hi = y >> 3; + + if (lo == hi) { + b[lo] |= (1 << (r_hi+1)) - (1 << r_lo); + } else { + int i; + for (i = lo + 1; i < hi; i++) + b[i] = 0xff; + b[lo] |= (0xff << r_lo); + b[hi] |= ~(0xff << (r_hi+1)); + } +} + +static void regset(opnd, c) char *opnd; register I32 c; { - if (opnd == ®dummy) + if (!opnd) return; c &= 0xFF; - opnd[1 + (c >> 3)] |= (1 << (c & 7)); + opnd[c >> 3] |= (1 << (c & 7)); } static char * @@ -1110,11 +1161,27 @@ register char *ret; register I32 def; I32 numlen; + int ismb = 0; + char **bitvec = 0; ret = regnode(ANYOF); opnd = regcode; - for (class = 0; class < 33; class++) + regc(0); + + if (opnd != ®dummy) { + if ((long)regcode & 1) /* fix alignment */ + regc(127); + if ((long)regcode & 2) { /* and fix alignment */ + regc(127); + regc(127); + } + bitvec = (char **)regcode; + } else + regsize += 3; + + for (class = 0; class < sizeof (char *); class++) regc(0); + if (*regparse == '^') { /* Complement of range. */ regnaughty++; regparse++; @@ -1122,6 +1189,7 @@ *opnd |= ANYOF_INVERT; } if (opnd != ®dummy) { + Newz(1003, *bitvec, (0x8000 + 0x100) / 8, char); if (regflags & PMf_FOLD) *opnd |= ANYOF_FOLD; if (regflags & PMf_LOCALE) @@ -1132,67 +1200,81 @@ while (regparse < regxend && *regparse != ']') { skipcond: class = UCHARAT(regparse++); + if (ISKANJI(class) && regparse < regxend) { + if (*opnd & ANYOF_LITERAL) + FAIL("kanji and hex/octal literal"); + class = (class << 8) + UCHARAT(regparse++); + ismb = 1; + } if (class == '\\') { class = UCHARAT(regparse++); switch (class) { case 'w': - if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (bitvec) { + if (regflags & PMf_LOCALE) { *opnd |= ANYOF_ALNUML; - } - else { - for (class = 0; class < 256; class++) - if (isALNUM(class)) - regset(opnd, class); + } + else { + for (class = 0; class < 256; class++) + if (isALNUM(class)) + regset(*bitvec, class); + } } lastclass = 1234; continue; case 'W': - if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (bitvec) { + if (regflags & PMf_LOCALE) { *opnd |= ANYOF_NALNUML; - } - else { - for (class = 0; class < 256; class++) - if (!isALNUM(class)) - regset(opnd, class); + } + else { + for (class = 0; class < 256; class++) + if (!isALNUM(class)) + regset(*bitvec, class); + block_on(*bitvec, 0x8000, 0xffff); + } } lastclass = 1234; continue; case 's': - if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (bitvec) { + if (regflags & PMf_LOCALE) { *opnd |= ANYOF_SPACEL; - } - else { - for (class = 0; class < 256; class++) - if (isSPACE(class)) - regset(opnd, class); + } + else { + for (class = 0; class < 256; class++) + if (isSPACE(class)) + regset(*bitvec, class); + } } lastclass = 1234; continue; case 'S': - if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (bitvec) { + if (regflags & PMf_LOCALE) { *opnd |= ANYOF_NSPACEL; - } - else { - for (class = 0; class < 256; class++) - if (!isSPACE(class)) - regset(opnd, class); + } + else { + for (class = 0; class < 256; class++) + if (!isSPACE(class)) + regset(*bitvec, class); + block_on(*bitvec, 0x8000, 0xffff); + } } lastclass = 1234; continue; case 'd': - for (class = '0'; class <= '9'; class++) - regset(opnd, class); + if (bitvec) { + block_on(*bitvec, '0', '9'); + } lastclass = 1234; continue; case 'D': - for (class = 0; class < '0'; class++) - regset(opnd, class); - for (class = '9' + 1; class < 256; class++) - regset(opnd, class); + if (bitvec) { + block_on(*bitvec, 0, '0' - 1); + block_on(*bitvec, '9' + 1, 255); + block_on(*bitvec, 0x8000, 0xffff); + } lastclass = 1234; continue; case 'n': @@ -1219,6 +1301,11 @@ case 'x': class = scan_hex(regparse, 2, &numlen); regparse += numlen; + if (bitvec) { + if (ismb) + FAIL("kanji and hex/octal literal"); + *opnd |= ANYOF_LITERAL; + } break; case 'c': class = UCHARAT(regparse++); @@ -1228,6 +1315,11 @@ case '5': case '6': case '7': case '8': case '9': class = scan_oct(--regparse, 3, &numlen); regparse += numlen; + if (bitvec) { + if (ismb) + FAIL("kanji and hex/octal literal"); + *opnd |= ANYOF_LITERAL; + } break; } } @@ -1245,8 +1337,8 @@ continue; /* do it next time */ } } - for ( ; lastclass <= class; lastclass++) - regset(opnd, lastclass); + if (bitvec) + block_on(*bitvec, lastclass, class); lastclass = class; } if (*regparse != ']') @@ -1312,7 +1404,7 @@ #ifdef REGALIGN #ifndef lint if (!((long)ret & 1)) - *ret++ = 127; + *ret++ = 120; #endif #endif ptr = ret; @@ -1517,7 +1609,6 @@ } #ifdef DEBUGGING - /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ @@ -1548,7 +1639,16 @@ PerlIO_printf(Perl_debug_log, "(%d)", (s-r->program)+(next-s)); s += 3; if (op == ANYOF) { - s += 33; + char **bitvec; + s++; + if ((long)s & 1) /* fix alignment */ + s++; + if ((long)s & 2) /* fix alignment */ + s += 2; + bitvec = (char **)s; + PerlIO_printf(Perl_debug_log, "bitvec: %08x, ", bitvec); + PerlIO_printf(Perl_debug_log, " %08x", *bitvec); + s += sizeof (char *); } if (regkind[(U8)op] == EXACT) { /* Literal string, where present. */ @@ -1771,5 +1871,42 @@ } Safefree(r->startp); Safefree(r->endp); + regclassfree(r); Safefree(r); +} + +static void +regclassfree(r) +struct regexp *r; +{ + register char *s; + register char op = EXACT; /* Arbitrary non-END op. */ + + + s = r->program + 1; + while (op != END) { /* While that wasn't END last time... */ +#ifdef REGALIGN + if (!((long)s & 1)) + s++; +#endif + op = OP(s); + s += regarglen[(U8)op] + 3; + if (op == ANYOF) { + char **bitvec; + s++; + if ((long)s & 1) /* fix alignment */ + s++; + if ((long)s & 2) /* fix alignment */ + s += 2; + bitvec = (char **)s; + Safefree(*bitvec); + s += sizeof (char *); + } + if (regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + s++; + while (*s++) + ; + } + } } diff -urN perl5.004_02/regcomp.h jperl5.004_02/regcomp.h --- perl5.004_02/regcomp.h Sat Jun 7 09:42:21 1997 +++ jperl5.004_02/regcomp.h Fri Aug 8 10:07:23 1997 @@ -256,6 +256,7 @@ #define MAGIC 0234 /* Flags for first parameter byte of ANYOF */ +#define ANYOF_LITERAL 0x80 #define ANYOF_INVERT 0x40 #define ANYOF_FOLD 0x20 #define ANYOF_LOCALE 0x10 diff -urN perl5.004_02/regexec.c jperl5.004_02/regexec.c --- perl5.004_02/regexec.c Tue Jul 29 06:16:23 1997 +++ jperl5.004_02/regexec.c Fri Aug 8 10:07:23 1997 @@ -58,6 +58,12 @@ #ifndef STATIC #define STATIC static #endif +#undef isascii +#define isascii(c) (0 < (c) && (c) < 0x7F) + +#define KANJI_MODE (_prog->reganch & ROPT_KANJI) +static regexp *_prog; /* save the latest regexp* here. + XXX this won't work in multi-threaded environment */ #ifdef DEBUGGING static I32 regnarrate = 0; @@ -214,6 +220,8 @@ croak("NULL regexp parameter"); return 0; } + + _prog = prog; /* save the thing */ if (startpos == strbeg) /* is ^ valid at stringarg? */ regprev = '\n'; @@ -267,6 +275,9 @@ s = startpos; minlen = SvCUR(prog->regmust); } + if (KANJI_MODE && s && kpart(startpos, s) == KPART_KANJI_2) { + s = startpos; + } } /* Mark beginning of line for ^ . */ @@ -311,9 +322,12 @@ if (*s == ch) { if (regtry(prog, s)) goto got_it; + if (KANJI_MODE && iskanji(*s) && s+1= regeol) sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) ++locinput; nextchar = UCHARAT(++locinput); break; case ANY: if (!nextchar && locinput >= regeol || nextchar == '\n') sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) ++locinput; nextchar = UCHARAT(++locinput); break; case EXACT: @@ -772,6 +808,11 @@ s = OPERAND(scan); if (nextchar < 0) nextchar = UCHARAT(locinput); + if (KANJI_MODE && !(*s & ANYOF_LITERAL) + && iskanji(nextchar) && locinput[1]) { + locinput++; + nextchar = (nextchar << 8) + UCHARAT(locinput); + } if (!reginclass(s, nextchar)) sayNO; if (!nextchar && locinput >= regeol) @@ -787,6 +828,8 @@ if (!(OP(scan) == ALNUM ? isALNUM(nextchar) : isALNUM_LC(nextchar))) sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) + ++locinput; nextchar = UCHARAT(++locinput); break; case NALNUML: @@ -798,6 +841,8 @@ if (OP(scan) == NALNUM ? isALNUM(nextchar) : isALNUM_LC(nextchar)) sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) + ++locinput; nextchar = UCHARAT(++locinput); break; case BOUNDL: @@ -839,6 +884,8 @@ if (OP(scan) == SPACE ? isSPACE(nextchar) : isSPACE_LC(nextchar)) sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) + ++locinput; nextchar = UCHARAT(++locinput); break; case DIGIT: @@ -851,6 +898,8 @@ sayNO; if (isDIGIT(nextchar)) sayNO; + if (KANJI_MODE && iskanji(nextchar) && locinput[1]) + ++locinput; nextchar = UCHARAT(++locinput); break; case REFFL: @@ -1088,13 +1137,14 @@ if (ln && regrepeat(scan, ln) < ln) sayNO; while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */ - /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(reginput) == c1 || - UCHARAT(reginput) == c2) - { - if (regmatch(next)) - sayYES; + if (!KANJI_MODE + || kpart(locinput, reginput) != KPART_KANJI_2) { + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + if (regmatch(next)) + sayYES; } /* Couldn't or didn't -- back up. */ reginput = locinput + ln; @@ -1112,13 +1162,14 @@ (!multiline || OP(next) == SEOL)) ln = n; /* why back off? */ while (n >= ln) { - /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(reginput) == c1 || - UCHARAT(reginput) == c2) - { - if (regmatch(next)) - sayYES; + if (!KANJI_MODE + || kpart(locinput, reginput) != KPART_KANJI_2) { + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + if (regmatch(next)) + sayYES; } /* Couldn't or didn't -- back up. */ n--; @@ -1188,6 +1239,7 @@ register char *opnd; register I32 c; register char *loceol = regeol; + char *oldscan; scan = reginput; if (max != 32767 && max < loceol - scan) @@ -1195,8 +1247,10 @@ opnd = OPERAND(p); switch (OP(p)) { case ANY: - while (scan < loceol && *scan != '\n') + while (scan < loceol && *scan != '\n') { + if (KANJI_MODE && iskanji(*scan)) scan++; scan++; + } break; case SANY: scan = loceol; @@ -1220,8 +1274,23 @@ scan++; break; case ANYOF: - while (scan < loceol && reginclass(opnd, *scan)) + c = UCHARAT(scan); + oldscan = scan; + if (KANJI_MODE && !(*opnd & ANYOF_LITERAL) + && iskanji(c) && scan < loceol) { scan++; + c = (c << 8) + UCHARAT(scan); + } + while (scan < loceol && reginclass(opnd, c)) { + scan++; + c = UCHARAT(scan); + oldscan = scan; + if (KANJI_MODE && iskanji(c) && scan < loceol) { + scan++; + c = (c << 8) + UCHARAT(scan); + } + } + scan = oldscan; break; case ALNUM: while (scan < loceol && isALNUM(*scan)) @@ -1233,8 +1302,10 @@ scan++; break; case NALNUM: - while (scan < loceol && !isALNUM(*scan)) + while (scan < loceol && !isALNUM(*scan)) { + if (KANJI_MODE && iskanji(*scan)) scan++; scan++; + } break; case NALNUML: regtainted = TRUE; @@ -1251,6 +1322,10 @@ scan++; break; case NSPACE: + while (scan < loceol && !isSPACE(*scan)) { + if (KANJI_MODE && iskanji(*scan)) scan++; + scan++; + } while (scan < loceol && !isSPACE(*scan)) scan++; break; @@ -1264,8 +1339,10 @@ scan++; break; case NDIGIT: - while (scan < loceol && !isDIGIT(*scan)) + while (scan < loceol && !isDIGIT(*scan)) { + if (KANJI_MODE && iskanji(*scan)) scan++; scan++; + } break; default: /* Called on something of 0 width. */ break; /* So match right here or not at all. */ @@ -1288,10 +1365,22 @@ { char flags = *p; bool match = FALSE; + char *bitvec; - c &= 0xFF; - if (p[1 + (c >> 3)] & (1 << (c & 7))) + p++; + if ((long)p & 1) /* fix alignment */ + p++; + if ((long)p & 2) /* fix alignment */ + p += 2; + bitvec = *(char **)p; + + c &= 0xFFFF; + if (0x8000 <= c) + c -= 0x8000 - 0x100; + if (bitvec[c >> 3] & (1 << (c & 7))) match = TRUE; + else if (0x100 <= c) + goto end; else if (flags & ANYOF_FOLD) { I32 cf; if (flags & ANYOF_LOCALE) { @@ -1300,7 +1389,7 @@ } else cf = fold[c]; - if (p[1 + (cf >> 3)] & (1 << (cf & 7))) + if (bitvec[cf >> 3] & (1 << (cf & 7))) match = TRUE; } @@ -1315,7 +1404,8 @@ match = TRUE; } } - +end: + ; return match ^ ((flags & ANYOF_INVERT) != 0); } diff -urN perl5.004_02/regexp.h jperl5.004_02/regexp.h --- perl5.004_02/regexp.h Sat Jan 18 03:05:21 1997 +++ jperl5.004_02/regexp.h Fri Aug 8 10:07:23 1997 @@ -35,3 +35,4 @@ #define ROPT_ANCH_GPOS 2 #define ROPT_SKIP 4 #define ROPT_IMPLICIT 8 +#define ROPT_KANJI 16 diff -urN perl5.004_02/t/TEST jperl5.004_02/t/TEST --- perl5.004_02/t/TEST Sat Aug 2 00:25:01 1997 +++ jperl5.004_02/t/TEST Fri Aug 8 10:07:23 1997 @@ -21,7 +21,7 @@ if ($#ARGV == -1) { @ARGV = split(/[ \n]/, - `echo base/*.t comp/*.t cmd/*.t io/*.t; echo op/*.t pragma/*.t lib/*.t`); + `echo base/*.t comp/*.t cmd/*.t io/*.t; echo op/*.t pragma/*.t lib/*.t kanji/*.t`); } if ($^O eq 'os2' || $^O eq 'qnx') { diff -urN perl5.004_02/t/harness jperl5.004_02/t/harness --- perl5.004_02/t/harness Fri Mar 21 11:50:32 1997 +++ jperl5.004_02/t/harness Fri Aug 8 10:07:23 1997 @@ -15,5 +15,5 @@ $Test::Harness::verbose = shift if @ARGV && $ARGV[0] eq '-v'; @tests = @ARGV; -@tests = unless @tests; +@tests = unless @tests; Test::Harness::runtests @tests; diff -urN perl5.004_02/t/kanji/format.t jperl5.004_02/t/kanji/format.t --- perl5.004_02/t/kanji/format.t Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/t/kanji/format.t Fri Aug 8 10:07:23 1997 @@ -0,0 +1,23 @@ +#!./perl +BEGIN { + $^H |= 0xf000; # Turn on all japanese features. +} + + +format XXX = +^>>>> +$var +^>>>> +$var +. + +open XXX, ">xxx"; + +$var = "あいうえお"; +write XXX; +close XXX; + +print "1..1\n"; + +if (`cat xxx` eq " あい\n うえ\n") {print "ok 1\n";} else {print "not ok 1\n";} + diff -urN perl5.004_02/t/kanji/op.t jperl5.004_02/t/kanji/op.t --- perl5.004_02/t/kanji/op.t Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/t/kanji/op.t Fri Aug 8 10:07:23 1997 @@ -0,0 +1,49 @@ +#!./perl +# +# chop, chomp +# +BEGIN { + $^H |= 0xf000; # Turn on all japanese features. +} + +print "1..11\n"; +$y = "あいう"; +$x = chomp($y); +if ($x ne 0 || $y ne "あいう") {print "not ok 1\n";} else {print "ok 1\n";}; + +$y = "あいう\n"; +$x = chomp($y); +if ($x ne 1 || $y ne "あいう") {print "not ok 2\n";} else {print "ok 2\n";}; + +$y = "あいう"; +$x = chop($y); +if ($x ne "う" || $y ne "あい") {print "not ok 3\n";} else {print "ok 3\n";}; + +$y = "あいうt"; +$x = chop($y); +if ($x ne "t" || $y ne "あいう") {print "not ok 4\n";} else {print "ok 4\n";}; + +# +# tr +# +$y = "あいうえお"; $y =~ tr/あ-う//cd; +if ($y ne "あいう") {print "not ok 5\n";} else {print "ok 5\n";}; +$y = "あいうえお"; $y =~ tr/あ-う/か-く/; +if ($y ne "かきくえお") {print "not ok 6\n";} else {print "ok 6\n";}; + +$y = 'abcabcabc'; $y =~ tr/a-c/d-f/; +if ($y ne 'defdefdef') {print "not ok 7\n";} else {print "ok 7\n";}; +$y = 'abc'; $y =~ tr/abc/def/; +if ($y ne 'def') {print "not ok 8\n";} else {print "ok 8\n";}; +$y = 'abcabcabc'; $y =~ tr/abc/def/; +if ($y ne 'defdefdef') {print "not ok 9\n";} else {print "ok 9\n";}; + +# +# split +# +@X = split(/(.)/, "abcde"); +if ($#X != 9) {print "not ok 10\n";} else {print "ok 10\n";}; + +@X = split(/(.)/, "あいうtえお"); +$x = $#X; +if ($x != 11) {print "not ok 11\n";} else {print "ok 11\n";}; diff -urN perl5.004_02/t/kanji/re_tests jperl5.004_02/t/kanji/re_tests --- perl5.004_02/t/kanji/re_tests Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/t/kanji/re_tests Fri Aug 8 10:07:23 1997 @@ -0,0 +1,255 @@ +あいう あいう y $& あいう +あいう xいう n - - +あいう あxう n - - +あいう あいx n - - +あいう xあいうy y $& あいう +あいう あいあいう y $& あいう +あい*う あいう y $& あいう +あい*いう あいう y $& あいう +あい*いう あいいう y $& あいいう +あい*いう あいいいいう y $& あいいいいう +あい{0,}いう あいいいいう y $& あいいいいう +あい+いう あいいう y $& あいいう +あい+いう あいう n - - +あい+いう あいq n - - +あい{1,}いう あいq n - - +あい+いう あいいいいう y $& あいいいいう +あい{1,}いう あいいいいう y $& あいいいいう +あい{1,3}いう あいいいいう y $& あいいいいう +あい{3,4}いう あいいいいう y $& あいいいいう +あい{4,5}いう あいいいいう n - - +あい?いう あいいう y $& あいいう +あい?いう あいう y $& あいう +あい{0,1}いう あいう y $& あいう +あい?いう あいいいいう n - - +あい?う あいう y $& あいう +あい{0,1}う あいう y $& あいう +^あいう$ あいう y $& あいう +^あいう$ あいうう n - - +^あいう あいうう y $& あいう +^あいう$ ああいう n - - +あいう$ ああいう y $& あいう +^ あいう y $& +$ あいう y $& +あ.う あいう y $& あいう +あ.う あxう y $& あxう +あ.*う あxyzう y $& あxyzう +あ.*う あxyzえ n - - +あ[いう]え あいう n - - +あ[いう]え あいえ y $& あいえ +あ[い-え]e あいえ n - - +あ[い-え]e あうe y $& あうe +あ[い-え] ああう y $& あう +あ[-い] あ- y $& あ- +あ[い-] あ- y $& あ- +あ[い-あ] - c - - +あ[]い - c - - +あ[ - c - - +あ] あ] y $& あ] +あ[]]い あ]い y $& あ]い +あ[^いう]え あeえ y $& あeえ +あ[^いう]え あいえ n - - +あ[^-い]う あえう y $& あえう +あ[^-い]う あ-う n - - +あ[^]い]う あ]う n - - +あ[^]い]う あえう y $& あえう +あい|うえ あいう y $& あい +あい|うえ あいうえ y $& あい +()ef えef y $&-$1 ef- +*あ - c - - +(*)い - c - - +$い い n - - +あ\ - c - - +あ\(い あ(い y $&-$1 あ(い- +あ\(*い あい y $& あい +あ\(*い あ((い y $& あ((い +あ\\い あ\い y $& あ\い +あいう) - c - - +(あいう - c - - +((あ)) あいう y $&-$1-$2 あ-あ-あ +(あ)い(う) あいう y $&-$1-$2 あいう-あ-う +あ+い+う ああいいあいう y $& あいう +あ{1,}い{1,}う ああいいあいう y $& あいう +あ** - c - - +あ.+?う あいうあいう y $& あいう +(あ+|い)* あい y $&-$1 あい-い +(あ+|い){0,} あい y $&-$1 あい-い +(あ+|い)+ あい y $&-$1 あい-い +(あ+|い){1,} あい y $&-$1 あい-い +(あ+|い)? あい y $&-$1 あ-あ +(あ+|い){0,1} あい y $&-$1 あ-あ +)( - c - - +[^あい]* うえe y $& うえe +あいう n - - +あ* y $& +([あいう])*え あいいいうえ y $&-$1 あいいいうえ-う +([あいう])*いうえ あいうえ y $&-$1 あいうえ-あ +あ|い|う|え|e e y $& e +(あ|い|う|え|e)f ef y $&-$1 ef-e +あいうえ*efg あいうえefg y $& あいうえefg +あい* xあいyあいいいz y $& あい +あい* xあyあいいいz y $& あ +(あい|うえ)e あいうえe y $&-$1 うえe-うえ +[あいhgefえう]ij hij y $& hij +^(あい|うえ)e あいうえe n x$1y xy +(あいう|)ef あいうえef y $&-$1 ef- +(あ|い)う*え あいうえ y $&-$1 いうえ-い +(あい|あい*)いう あいう y $&-$1 あいう-あ +あ([いう]*)う* あいう y $&-$1 あいう-いう +あ([いう]*)(う*え) あいうえ y $&-$1-$2 あいうえ-いう-え +あ([いう]+)(う*え) あいうえ y $&-$1-$2 あいうえ-いう-え +あ([いう]*)(う+え) あいうえ y $&-$1-$2 あいうえ-い-うえ +あ[いうえ]*えうえうえe あえうえうえe y $& あえうえうえe +あ[いうえ]+えうえうえe あえうえうえe n - - +(あい|あ)い*う あいう y $&-$1 あいう-あい +((あ)(い)う)(え) あいうえ y $1-$2-$3-$4 あいう-あ-い-え +[あ-おさ-そ][あ-えさ-せ]* あえさす y $& あえさす +^あ(いう+|い[eh])g|.h$ あいh y $&-$1 いh- +(いう+え$|ef*g.|h?i(j|k)) effgz y $&-$1-$2 effgz-effgz- +(いう+え$|ef*g.|h?i(j|k)) ij y $&-$1-$2 ij-ij-j +(いう+え$|ef*g.|h?i(j|k)) effg n - - +(いう+え$|ef*g.|h?i(j|k)) いうええ n - - +(いう+え$|ef*g.|h?i(j|k)) reffgz y $&-$1-$2 effgz-effgz- +((((((((((あ)))))))))) あ y $10 あ +((((((((((あ))))))))))\10 ああ y $& ああ +((((((((((あ))))))))))\41 ああ n - - +((((((((((あ))))))))))\41 あ! y $& あ! +(((((((((あ))))))))) あ y $& あ +(.*)う(.*) あいうえe y $&-$1-$2 あいうえe-あい-えe +\((.*), (.*)\) (あ, い) y ($2, $1) (い, あ) +[k] あい n - - +あいうえ あいうえ y $&-\$&-\\$& あいうえ-$&-\あいうえ +あ(いう)え あいうえ y $1-\$1-\\$1 いう-$1-\いう +あ[-]?う あう y $& あう +(あいう)\1 あいうあいう y $1 あいう +([あ-う]*)\1 あいうあいう y $1 あいう +'あいう'i あいう y $& あいう +'あいう'i Xいう n - - +'あいう'i あXう n - - +'あいう'i あいX n - - +'あいう'i XあいうY y $& あいう +'あいう'i あいあいう y $& あいう +'あい*う'i あいう y $& あいう +'あい*いう'i あいう y $& あいう +'あい*いう'i あいいう y $& あいいう +'あい*?いう'i あいいいいう y $& あいいいいう +'あい{0,}?いう'i あいいいいう y $& あいいいいう +'あい+?いう'i あいいう y $& あいいう +'あい+いう'i あいう n - - +'あい+いう'i あいQ n - - +'あい{1,}いう'i あいQ n - - +'あい+いう'i あいいいいう y $& あいいいいう +'あい{1,}?いう'i あいいいいう y $& あいいいいう +'あい{1,3}?いう'i あいいいいう y $& あいいいいう +'あい{3,4}?いう'i あいいいいう y $& あいいいいう +'あい{4,5}?いう'i あいいいいう n - - +'あい??いう'i あいいう y $& あいいう +'あい??いう'i あいう y $& あいう +'あい{0,1}?いう'i あいう y $& あいう +'あい??いう'i あいいいいう n - - +'あい??う'i あいう y $& あいう +'あい{0,1}?う'i あいう y $& あいう +'^あいう$'i あいう y $& あいう +'^あいう$'i あいうう n - - +'^あいう'i あいうう y $& あいう +'^あいう$'i ああいう n - - +'あいう$'i ああいう y $& あいう +'$'i あいう y $& +'あ.う'i あいう y $& あいう +'あ.う'i あXう y $& あXう +'あ.*?う'i あXYZう y $& あXYZう +'あ[いう]え'i あいう n - - +'あ[いう]え'i あいえ y $& あいえ +'あ[い-え]'i ああう y $& あう +'あ[-い]'i あ- y $& あ- +'あ[い-]'i あ- y $& あ- +'あ[い-あ]'i - c - - +'あ[]い'i - c - - +'あ['i - c - - +'あ]'i あ] y $& あ] +'あ[]]い'i あ]い y $& あ]い +'あ[^いう]え'i あそえ y $& あそえ +'あ[^いう]え'i あいえ n - - +'あ[^-い]う'i あえう y $& あえう +'あ[^-い]う'i あ-う n - - +'あ[^]い]う'i あ]う n - - +'あ[^]い]う'i あえう y $& あえう +'あい|うえ'i あいう y $& あい +'あい|うえ'i あいうえ y $& あい +'*あ'i - c - - +'(*)い'i - c - - +'$い'i い n - - +'あ\'i - c - - +'あ\(い'i あ(い y $&-$1 あ(い- +'あ\(*い'i あい y $& あい +'あ\(*い'i あ((い y $& あ((い +'あいう)'i - c - - +'(あいう'i - c - - +'((あ))'i あいう y $&-$1-$2 あ-あ-あ +'(あ)い(う)'i あいう y $&-$1-$2 あいう-あ-う +'あ+い+う'i ああいいあいう y $& あいう +'あ{1,}い{1,}う'i ああいいあいう y $& あいう +'あ**'i - c - - +'あ.+?う'i あいうあいう y $& あいう +'あ.*?う'i あいうあいう y $& あいう +'あ.{0,5}?う'i あいうあいう y $& あいう +'(あ+|い)*'i あい y $&-$1 あい-い +'(あ+|い){0,}'i あい y $&-$1 あい-い +'(あ+|い)+'i あい y $&-$1 あい-い +'(あ+|い){1,}'i あい y $&-$1 あい-い +'(あ+|い)?'i あい y $&-$1 あ-あ +'(あ+|い){0,1}'i あい y $&-$1 あ-あ +'(あ+|い){0,1}?'i あい y $&-$1 - +')('i - う - - +'[^あい]*'i うDE y $& うDE +'あいう'i n - - +'あ*'i y $& +'([あいう])*え'i あいいいうえ y $&-$1 あいいいうえ-う +'([あいう])*いうえ'i あいうえ y $&-$1 あいうえ-あ +'あ|い|う|え|e'i E y $& E +'(あ|い|う|え|e)f'i EF y $&-$1 EF-E +'あいうd*eふg'i あいうDEふG y $& あいうDEふG +'あい*'i XあいYあいいいZ y $& あい +'あい*'i XあYあいいいZ y $& あ +'(あい|うえ)e'i あいうえE y $&-$1 うえE-うえ +'[あいhgefえう]ij'i HIJ y $& HIJ +'^(あい|うえ)e'i あいうDE n x$1y XY +'(あいう|)ef'i あいうDEF y $&-$1 EF- +'(あ|い)う*D'i あいうD y $&-$1 いうD-い +'(あ|い)う*D'i あいううD y $&-$1 いううD-い +'(あい|あい*)いう'i あいう y $&-$1 あいう-あ +'あ([いう]*)う*'i あいう y $&-$1 あいう-いう +'あ([いう]*)(う*D)'i あいうD y $&-$1-$2 あいうD-いう-D +'あ([いう]+)(う*D)'i あいうD y $&-$1-$2 あいうD-いう-D +'あ([いう]*)(う+D)'i あいうD y $&-$1-$2 あいうD-い-うD +'あ[いうD]*DうDうDe'i あDうDうDE y $& あDうDうDE +'あ[いうD]+DうDうDe'i あDうDうDE n - - +'(あい|あ)い*う'i あいう y $&-$1 あいう-あい +'((あ)(い)う)(D)'i あいうD y $1-$2-$3-$4 あいう-あ-い-D +'[a-zA-Z_][a-zA-Z0-9_]*'i ALPHa y $& ALPHa +'^あ(いう+|い[eh])g|.h$'i あいH y $&-$1 いH- +'(いう+え$|ef*g.|h?i(j|k))'i EFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'(いう+え$|ef*g.|h?i(j|k))'i IJ y $&-$1-$2 IJ-IJ-J +'(いう+え$|ef*g.|h?i(j|k))'i EFFG n - - +'(いう+え$|ef*g.|h?i(j|k))'i いうDD n - - +'(いう+え$|ef*g.|h?i(j|k))'i REFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'((((((((((あ))))))))))'i あ y $10 あ +'(((((((((あ)))))))))'i あ y $& あ +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(あ))))))))))'i あ y $1 あ +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(あ|い|う))))))))))'i う y $1 う +'(.*)う(.*)'i あいうDE y $&-$1-$2 あいうDE-あい-DE +'\((.*), (.*)\)'i (あ, い) y ($2, $1) (い, あ) +'[k]'i あい n - - +'あいうD'i あいうD y $&-\$&-\\$& あいうD-$&-\あいうD +'あ(いう)D'i あいうD y $1-\$1-\\$1 いう-$1-\いう +'あ[-]?う'i あう y $& あう +'(あいう)\1'i あいうあいう y $1 あいう +'([あ-う]*)\1'i あいうあいう y $1 あいう +あ(?!い). あいあえ y $& あえ +あ(?=え). あいあえ y $& あえ +あ(?=う|え). あいあえ y $& あえ +あ(?:い|う|え)(.) あうお y $1 お +あ(?:い|う|え)*(.) あうe y $1 e +あ(?:い|う|え)+?(.) あうお y $1 お +あ(?:い|(う|e){1,2}?|え)+?(.) あうe y $1$2 うe +^(.+)?い あい y $1 あ diff -urN perl5.004_02/t/kanji/regexp.t jperl5.004_02/t/kanji/regexp.t --- perl5.004_02/t/kanji/regexp.t Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/t/kanji/regexp.t Fri Aug 8 10:07:23 1997 @@ -0,0 +1,37 @@ +#!./perl +# $RCSfile: regexp.t,v $$Revision: 4.1 $$Date: 92/08/07 18:28:20 $ +BEGIN { + $^H |= 0xf000; +} + +open(TESTS,'kanji/re_tests') || open(TESTS,'t/op/re_tests') + || die "Can't open re_tests"; +while () { } +$numtests = $.; +close(TESTS); + +print "1..$numtests\n"; +open(TESTS,'kanji/re_tests') || open(TESTS,'t/kanji/re_tests') + || die "Can't open re_tests"; +$| = 1; +while () { + ($pat, $subject, $result, $repl, $expect) = split(/[\t\n]/,$_); + $input = join(':',$pat,$subject,$result,$repl,$expect); + $pat = "'$pat'" unless $pat =~ /^'/; + eval "\$match = (\$subject =~ m$pat); \$got = \"$repl\";"; + if ($result eq 'c') { + if ($@ ne '') {print "ok $.\n";} else {print "not ok $.\n";} + } + elsif ($result eq 'n') { + if (!$match) {print "ok $.\n";} else {print "not ok $. $input => $got\n";} + } + else { + if ($match && $got eq $expect) { + print "ok $.\n"; + } + else { + print "not ok $. $input => $got\n"; + } + } +} +close(TESTS); diff -urN perl5.004_02/t/kanji/sjis.t jperl5.004_02/t/kanji/sjis.t --- perl5.004_02/t/kanji/sjis.t Thu Jan 1 09:00:00 1970 +++ jperl5.004_02/t/kanji/sjis.t Fri Aug 8 10:07:23 1997 @@ -0,0 +1,3 @@ +#!./perl +print "1..1\n"; +print "ok 1\n"; diff -urN perl5.004_02/toke.c jperl5.004_02/toke.c --- perl5.004_02/toke.c Tue Aug 5 22:26:01 1997 +++ jperl5.004_02/toke.c Fri Aug 8 10:07:23 1997 @@ -766,18 +766,33 @@ if (lex_inwhat == OP_TRANS) { if (dorange) { I32 i; - I32 max; + I32 max, min; i = d - SvPVX(sv); - SvGROW(sv, SvLEN(sv) + 256); - d = SvPVX(sv) + i; - d -= 2; - max = (U8)d[1]; - for (i = (U8)*d; i <= max; i++) - *d++ = i; + + if ((hints & HINT_KANJI_TR) + && kpart(SvPVX(sv), d-1) == KPART_KANJI_2 + && kpart(SvPVX(sv), d-3) == KPART_KANJI_2) { + /* We have a kanji-range here */ + max = twochar_to_int(d[-2], d[-1]); + min = twochar_to_int(d[-4], d[-3]); + SvGROW(sv, SvLEN(sv) + (max-min)*2); + d = SvPVX(sv) + i; + d -= 4; + for (i = min; i <= max; i = jnextcode(i)) { + *d++ = int_to_1stbyte(i); + *d++ = int_to_2ndbyte(i); + } + } else { + SvGROW(sv, SvLEN(sv) + 256); + d = SvPVX(sv) + i; + d -= 2; + max = d[1] & 0377; + for (i = (*d & 0377); i <= max; i++) + *d++ = i; + } dorange = FALSE; continue; - } - else if (*s == '-' && s+1 < send && s != start) { + } else if (*s == '-' && s+1 < send && s != start) { dorange = TRUE; s++; } @@ -832,10 +847,14 @@ case '4': case '5': case '6': case '7': *d++ = scan_oct(s, 3, &len); s += len; + if ((hints & HINT_KANJI_TR) && lex_inwhat == OP_TRANS) + yylval.opval->op_private &= ~OPpTRANS_KANJI; continue; case 'x': *d++ = scan_hex(++s, 2, &len); s += len; + if ((hints & HINT_KANJI_TR) && lex_inwhat == OP_TRANS) + yylval.opval->op_private &= ~OPpTRANS_KANJI; continue; case 'c': s++; @@ -867,6 +886,8 @@ s++; continue; } + if ((hints & HINT_KANJI_STRING) && iskanji(*s) && s < send) + *d++ = *s++; *d++ = *s++; } *d = '\0'; @@ -4732,8 +4753,8 @@ croak("Translation replacement not terminated"); } - New(803,tbl,256,short); - op = newPVOP(OP_TRANS, 0, (char*)tbl); + op = newPVOP(OP_TRANS, 0, 0); + /* tbl is allocated at pmtrans since it's size is is no longer fixed */ complement = delete = squash = 0; while (*s == 'c' || *s == 'd' || *s == 's') { @@ -4741,12 +4762,13 @@ complement = OPpTRANS_COMPLEMENT; else if (*s == 'd') delete = OPpTRANS_DELETE; - else + else squash = OPpTRANS_SQUASH; s++; } op->op_private = delete|squash|complement; - + if (hints & HINT_KANJI_TR) + op->op_private |= OPpTRANS_KANJI; lex_op = op; yylval.ival = OP_TRANS; return s; @@ -4978,9 +5000,11 @@ s++; else *to++ = *s++; - } - else if (*s == term) + } else if (*s == term) { break; + } else if ((hints & HINT_KANJI_STRING) && iskanji(*s) && s < bufend-1) { + *to++ = *s++; + } *to = *s; } } @@ -4998,6 +5022,8 @@ break; else if (*s == multi_open) brackets++; + else if ((hints & HINT_KANJI_STRING) && iskanji(*s) && s < bufend-1) + *to++ = *s++; *to = *s; } } @@ -5190,6 +5216,11 @@ eol = bufend = SvPVX(linestr) + SvCUR(linestr); if (*s != '#') { for (t = s; t < eol; t++) { + if ((hints & HINT_KANJI_STRING) && iskanji(*t)) { + t++; + continue; + } + if (*t == '~' && t[1] == '~' && SvCUR(stuff)) { needargs = FALSE; goto enough; /* ~~ must be first line in formline */ @@ -5223,6 +5254,8 @@ else lex_state = LEX_FORMLINE; nextval[nexttoke].opval = (OP*)newSVOP(OP_CONST, 0, stuff); + nextval[nexttoke].opval->op_private |= + ((hints & HINT_KANJI_FORMAT) ? 8 : 0); force_next(THING); nextval[nexttoke].ival = OP_FORMLINE; force_next(LSTOP); diff -urN perl5.004_02/util.c jperl5.004_02/util.c --- perl5.004_02/util.c Tue Jul 29 09:44:50 1997 +++ jperl5.004_02/util.c Fri Aug 8 10:07:23 1997 @@ -323,6 +323,7 @@ { register char *s, *x; register I32 first; + char *tops = big; if (!little) return big; @@ -330,6 +331,10 @@ if (!first) return big; while (*big) { + if ((hints & HINT_KANJI_STRING) && kpart(tops, big) == KPART_KANJI_2) { + big++; + continue; + } if (*big++ != first) continue; for (x=big,s=little; *s; /**/ ) { @@ -358,6 +363,7 @@ register char *s, *x; register I32 first = *little; register char *littleend = lend; + char *tops = big; if (!first && little >= littleend) return big; @@ -365,6 +371,10 @@ return Nullch; bigend -= littleend - little++; while (big <= bigend) { + if ((hints & HINT_KANJI_STRING) && kpart(tops, big) == KPART_KANJI_2) { + big++; + continue; + } if (*big++ != first) continue; for (x=big,s=little; s < littleend; /**/ ) { @@ -398,6 +408,10 @@ bigbeg = big; big = bigend - (littleend - little++); while (big >= bigbeg) { + if ((hints & HINT_KANJI_STRING) && kpart(bigbeg, big) == KPART_KANJI_2) { + big--; + continue; + } if (*big-- != first) continue; for (x=big+2,s=little; s < littleend; /**/ ) { @@ -895,6 +909,7 @@ register unsigned char *table; register unsigned char *olds; register unsigned char *oldlittle; + unsigned char *tops = big; if (SvTYPE(littlestr) != SVt_PVBM || !SvVALID(littlestr)) { STRLEN len; @@ -910,12 +925,18 @@ return Nullch; little = (unsigned char*)SvPVX(littlestr); s = bigend - littlelen; - if (*s == *little && memEQ((char*)s,(char*)little,littlelen)) + if (*s == *little + && (!(hints & HINT_KANJI_STRING) + || kpart((char *)tops,(char *)s)!=KPART_KANJI_2) + && memEQ((char*)s,(char*)little,littlelen)) return (char*)s; /* how sweet it is */ else if (bigend[-1] == '\n' && little[littlelen-1] != '\n' && s > big) { s--; - if (*s == *little && memEQ((char*)s,(char*)little,littlelen)) + if (*s == *little + && (!(hints & HINT_KANJI_STRING) + || kpart((char *)tops,(char *)s)!=KPART_KANJI_2) + && memEQ((char*)s,(char*)little,littlelen)) return (char*)s; } return Nullch; @@ -952,6 +973,14 @@ goto top2; return Nullch; } + if ((hints & HINT_KANJI_STRING) + && kpart((char *)tops,(char *)s) == 2) { + s = olds + 1; + little = oldlittle; + if (s < bigend) + goto top2; + return Nullch; + } return (char *)s; } } @@ -1019,6 +1048,8 @@ return Nullch; } +#define ISKANJI(c) ((hints & HINT_KANJI_REGEXP) && iskanji(c)) + I32 ibcmp(s1, s2, len) char *s1, *s2; @@ -1029,6 +1060,12 @@ while (len--) { if (*a != *b && *a != fold[*b]) return 1; + if (ISKANJI(*a) && a[1] && b[1]) { + len--; + a++,b++; + if (*a != *b) + return 1; + } a++,b++; } return 0; @@ -1044,6 +1081,12 @@ while (len--) { if (*a != *b && *a != fold_locale[*b]) return 1; + if (ISKANJI(*a) && a[1] && b[1]) { + len--; + a++,b++; + if (*a != *b) + return 1; + } a++,b++; } return 0; diff -urN perl5.004_02/win32/Makefile jperl5.004_02/win32/Makefile --- perl5.004_02/win32/Makefile Tue Jul 29 09:02:36 1997 +++ jperl5.004_02/win32/Makefile Fri Aug 8 10:07:23 1997 @@ -163,7 +163,8 @@ ..\taint.c \ ..\toke.c \ ..\universal.c \ - ..\util.c + ..\util.c \ + ..\kanji.c CORE_OBJ= ..\av.obj \ ..\deb.obj \ @@ -190,7 +191,8 @@ ..\taint.obj \ ..\toke.obj \ ..\universal.obj\ - ..\util.obj + ..\util.obj \ + ..\kanji.obj WIN32_C = perllib.c \ win32.c \