870 lines
32 KiB
C
870 lines
32 KiB
C
/*-
|
|
* Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
|
|
* All rights reserved.
|
|
*
|
|
* Some parts of this code are derived from the public domain software
|
|
* DECUS cpp (1984,1985) written by Martin Minow.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* M B C H A R . C
|
|
* C h a r a c t e r h a n d l i n g R o u t i n e s
|
|
*
|
|
* Character handling and multi-byte character handling routines are
|
|
* placed here.
|
|
*/
|
|
|
|
#if PREPROCESSED
|
|
#include "mcpp.H"
|
|
#else
|
|
#include "system.H"
|
|
#include "internal.H"
|
|
#endif
|
|
|
|
/*
|
|
* Tables of character types and multi-byte character types.
|
|
*
|
|
* Some of these character attributes will be overwritten by
|
|
* execution time option '-@post' or '-@old'.
|
|
* Warning on erroneous sequence will be issued from the caller routines:
|
|
* scan_quote(), scan_id() or scan_number().
|
|
*/
|
|
|
|
/* Non-ASCII characters are always checked by mb_read(). */
|
|
#define NA 0x4000 /* Non-ASCII characters */
|
|
|
|
/* Horizontal spaces (' ', '\t' and TOK_SEP) */
|
|
#define HSPA (SPA | HSP)
|
|
|
|
short * char_type; /* Pointer to one of the following type_*[]. */
|
|
|
|
#define EJ1 0x100 /* 1st byte of EUC_JP */
|
|
#define EJ2 0x200 /* 2nd byte of EUC_JP */
|
|
#define GB1 0x400 /* 1st byte of GB2312 */
|
|
#define GB2 0x800 /* 2nd byte of GB2312 */
|
|
#define KS1 0x1000 /* 1st byte of KSC5601 */
|
|
#define KS2 0x2000 /* 2nd byte of KSC5601 */
|
|
|
|
#define EJ12 (EJ1 | EJ2) /* 1st byte or 2nd byte of EUC_JP */
|
|
#define GB12 (GB1 | GB2)
|
|
#define KS12 (KS1 | KS2)
|
|
#define EJ1N (NA | EJ1)
|
|
#define EU12N (NA | EJ12 | GB12 | KS12)
|
|
/* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
|
|
|
|
static short type_euc[ UCHARMAX + 1] = {
|
|
/*
|
|
* For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
|
|
*/
|
|
|
|
/* Character type codes */
|
|
/* 0, 1, 2, 3, 4, 5, 6, 7, */
|
|
/* 8, 9, A, B, C, D, E, F, Hex */
|
|
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
|
|
000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
|
|
/* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
|
|
000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
|
|
HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
|
|
PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
|
|
DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
|
|
DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
|
|
|
|
000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
|
|
LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
|
|
000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
|
|
LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
|
|
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
|
|
NA, NA, NA, NA, NA, NA, EJ1N, NA, /* 88 .. 8F */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
|
|
NA, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A0 .. A7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A8 .. AF */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B0 .. B7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B8 .. BF */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C0 .. C7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C8 .. CF */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D0 .. D7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D8 .. DF */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E0 .. E7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E8 .. EF */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* F0 .. F7 */
|
|
EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA, /* F8 .. FF */
|
|
};
|
|
|
|
static short type_bsl[ UCHARMAX + 1] = {
|
|
/*
|
|
* For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
|
|
* the second byte of multi-byte character.
|
|
*/
|
|
|
|
#define SJ1 0x100 /* 1st byte of SJIS */
|
|
#define SJ2 0x200 /* 2nd byte of SJIS */
|
|
#define BF1 0x400 /* 1st byte of BIGFIVE */
|
|
#define BF2 0x800 /* 2nd byte of BIGFIVE */
|
|
|
|
#define SB2 (SJ2 | BF2)
|
|
#define SJ2N (NA | SJ2)
|
|
#define SB2N (NA | SJ2 | BF2)
|
|
#define SJ12N (NA | SJ1 | SJ2)
|
|
#define BF12N (NA | BF1 | BF2)
|
|
#define SB12N (NA | SJ1 | SJ2 | BF1 | BF2)
|
|
#define S2B12N (NA | SJ2 | BF1 | BF2)
|
|
|
|
#define LSB2 (LET | SB2)
|
|
#define PSB2 (PUNC| SB2)
|
|
|
|
/* Character type codes */
|
|
/* 0, 1, 2, 3, 4, 5, 6, 7, */
|
|
/* 8, 9, A, B, C, D, E, F, Hex */
|
|
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
|
|
000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
|
|
/* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
|
|
000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
|
|
HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
|
|
PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
|
|
DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
|
|
DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
|
|
|
|
SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 40 @ABCDEFG */
|
|
LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 48 HIJKLMNO */
|
|
LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 50 PQRSTUVW */
|
|
LSB2, LSB2, LSB2, PSB2, SB2, PSB2, PSB2, LSB2, /* 58 XYZ[\]^_ */
|
|
SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 60 `abcdefg */
|
|
LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 68 hijklmno */
|
|
LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 70 pqrstuvw */
|
|
LSB2, LSB2, LSB2, PSB2, PSB2, PSB2, PSB2, 000, /* 78 xyz{|}~ */
|
|
|
|
SB2N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 80 .. 87 */
|
|
SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 88 .. 8F */
|
|
SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 90 .. 97 */
|
|
SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 98 .. 9F */
|
|
SJ2N, S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A0 .. A7 */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A8 .. AF */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B0 .. B7 */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B8 .. BF */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C0 .. C7 */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C8 .. CF */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D0 .. D7 */
|
|
S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D8 .. DF */
|
|
SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E0 .. E7 */
|
|
SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E8 .. EF */
|
|
SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* F0 .. F7 */
|
|
SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA, /* F8 .. FF */
|
|
};
|
|
|
|
/*
|
|
* For ISO2022_JP multi-byte character encoding.
|
|
*/
|
|
|
|
#define IS1 0x100 /* 1st byte of shift-sequence */
|
|
#define IS2 0x200 /* 2nd byte of shift-sequence */
|
|
#define IS3 0x400 /* 3rd byte of shift-sequence */
|
|
#define IS4 0x800 /* 4th byte of shift-sequence */
|
|
#define IJP 0x1000 /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1) */
|
|
|
|
#define PIJP (PUNC | IJP)
|
|
#define QIJP (QUO | IJP)
|
|
#define DTJP (DOT | IJP)
|
|
#define DGJP (DIG | IJP)
|
|
#define LIJP (LET | IJP)
|
|
|
|
#define JPS2 (IJP | IS2)
|
|
#define PJPS23 (PIJP | IS2 | IS3)
|
|
#define LJPS3 (LIJP | IS3)
|
|
#define LJPS4 (LIJP | IS4)
|
|
|
|
static short type_iso2022_jp[ UCHARMAX + 1] = {
|
|
|
|
/* Character type codes */
|
|
/* 0, 1, 2, 3, 4, 5, 6, 7, */
|
|
/* 8, 9, A, B, C, D, E, F, Hex */
|
|
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
|
|
000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
|
|
/* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
|
|
000, LET, LET, IS1, 000, 000, 000, HSPA, /* 18 */
|
|
HSPA, PIJP, QIJP, PIJP, JPS2, PIJP, PIJP, QIJP, /* 20 !"#$%&' */
|
|
PJPS23,PIJP, PIJP, PIJP, PIJP, PIJP, DTJP, PIJP, /* 28 ()*+,-./ */
|
|
DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, /* 30 01234567 */
|
|
DGJP, DGJP, PIJP, PIJP, PIJP, PIJP, PIJP, PIJP, /* 38 89:;<=>? */
|
|
|
|
IJP, LIJP, LJPS3, LIJP, LJPS4, LIJP, LIJP, LIJP, /* 40 @ABCDEFG */
|
|
LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 48 HIJKLMNO */
|
|
LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 50 PQRSTUVW */
|
|
LIJP, LIJP, LIJP, PIJP, IJP, PIJP, PIJP, LIJP, /* 58 XYZ[\]^_ */
|
|
IJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 60 `abcdefg */
|
|
LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 68 hijklmno */
|
|
LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 70 pqrstuvw */
|
|
LIJP, LIJP, LIJP, PIJP, PIJP, PIJP, PIJP, 000, /* 78 xyz{|}~ */
|
|
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 88 .. 8F */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* A0 .. A7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* A8 .. AF */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* B0 .. B7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* B8 .. BF */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* C0 .. C7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* C8 .. CF */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* D0 .. D7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* D8 .. DF */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* E0 .. E7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* E8 .. EF */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* F0 .. F7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
|
|
};
|
|
|
|
/*
|
|
* For UTF8 multi-byte character encoding.
|
|
*/
|
|
|
|
#define U2_1 0x100 /* 1st byte of 2-byte encoding of UTF8 */
|
|
#define U3_1 0x200 /* 1st byte of 3-byte encoding of UTF8 */
|
|
#define U4_1 0x400 /* 1st byte of 4-byte encoding of UTF8 */
|
|
#define UCONT 0x800 /* Continuation of a 2, 3, or 4 byte UTF8 sequence */
|
|
#define U2_1N (NA | U2_1)
|
|
#define U3_1N (NA | U3_1)
|
|
#define U4_1N (NA | U4_1)
|
|
#define UCONTN (NA | UCONT)
|
|
|
|
static short type_utf8[ UCHARMAX + 1] = {
|
|
|
|
/* Character type codes */
|
|
/* 0, 1, 2, 3, 4, 5, 6, 7, */
|
|
/* 8, 9, A, B, C, D, E, F, Hex */
|
|
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
|
|
000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
|
|
000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
|
|
/* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
|
|
000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
|
|
HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
|
|
PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
|
|
DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
|
|
DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
|
|
|
|
000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
|
|
LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
|
|
000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
|
|
LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
|
|
LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
|
|
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 80 .. 87 */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 88 .. 8F */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 90 .. 97 */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 98 .. 9F */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A0 .. A7 */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A8 .. AF */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B0 .. B7 */
|
|
UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B8 .. BF */
|
|
NA, NA, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C0 .. C7 */
|
|
U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C8 .. CF */
|
|
U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D0 .. D7 */
|
|
U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D8 .. DF */
|
|
U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E0 .. E7 */
|
|
U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E8 .. EF */
|
|
U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA, NA, NA, /* F0 .. F7 */
|
|
NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
|
|
};
|
|
|
|
#define SETLOCALE 2 /* #pragma setlocale (not __setlocale) */
|
|
|
|
#define NUM_ENCODING 8
|
|
#define NUM_ALIAS 6
|
|
|
|
/* Names of encoding recognized. Table for search_encoding(). */
|
|
static const char * const encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
|
|
/* Visual C full, Visual C short
|
|
, 4 miscellaneous */
|
|
{ "english", "c"
|
|
, "c", "en", "latin", "iso8859"},
|
|
{ "", ""
|
|
, "eucjp", "euc", "ujis", ""},
|
|
{ "chinesesimplified", "chs"
|
|
, "gb2312", "cngb", "euccn", ""},
|
|
{ "korean", "kor"
|
|
, "ksc5601", "ksx1001", "wansung", "euckr"},
|
|
{ "japanese", "jpn"
|
|
, "sjis", "shiftjis", "mskanji", ""},
|
|
{ "chinesetraditional", "cht"
|
|
, "bigfive", "big5", "cnbig5", "euctw"},
|
|
{ "", ""
|
|
, "iso2022jp", "iso2022jp1", "jis", ""},
|
|
{ "", ""
|
|
, "utf8", "utf", "", ""},
|
|
};
|
|
|
|
static int mbstart;
|
|
static int mb2;
|
|
|
|
static size_t mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
|
|
/* For 2-byte encodings of mbchar */
|
|
static const char * search_encoding( char * norm, int alias);
|
|
/* Search encoding_name[][] table */
|
|
static void strip_bar( char * string);
|
|
/* Remove '_', '-' or '.' in the string */
|
|
static void conv_case( char * name, char * lim, int upper);
|
|
/* Convert to upper/lower case */
|
|
static size_t mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
|
|
/* For ISO2022_JP encoding */
|
|
static size_t mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
|
|
/* For UTF8 mbchar encoding */
|
|
|
|
#define NAMLEN 20
|
|
#define UPPER 1 /* To upper */
|
|
#define LOWER 0 /* To lower */
|
|
|
|
|
|
const char * set_encoding(
|
|
char * name, /* Name of encoding specified */
|
|
char * env, /* Name of environment variable */
|
|
int pragma
|
|
/* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
|
|
)
|
|
/*
|
|
* Search the encoding specified and re-initialize mbchar settings.
|
|
*/
|
|
{
|
|
const char * unknown_encoding
|
|
= "Unknown encoding: %s%.0ld%.0s"; /* _W1_ */
|
|
const char * too_long
|
|
= "Too long encoding name: %s%.0ld%.0s"; /* _E_ */
|
|
const char * loc = "";
|
|
int alias;
|
|
char norm[ NAMLEN];
|
|
/*
|
|
* Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
|
|
* and lowered.
|
|
*/
|
|
|
|
if (strlen( name) >= NAMLEN) {
|
|
if ((env || pragma) && (warn_level & 1)) {
|
|
cwarn( too_long, name, 0L, NULL);
|
|
} else {
|
|
mcpp_fprintf( ERR, too_long, name);
|
|
mcpp_fputc( '\n', ERR);
|
|
}
|
|
}
|
|
strcpy( norm, name);
|
|
if (norm[ 5] == '.')
|
|
memmove( norm, norm + 5, strlen( norm + 5) + 1);
|
|
/* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other */
|
|
conv_case( norm, norm + strlen( norm), LOWER);
|
|
strip_bar( norm);
|
|
|
|
if (strlen( name) == 0) { /* "" */
|
|
mbchar = MBCHAR; /* Restore to the default encoding */
|
|
} else if (memcmp( norm, "iso8859", 7) == 0 /* iso8859* */
|
|
|| memcmp( norm, "latin", 5) == 0 /* latin* */
|
|
|| memcmp( norm, "en", 2) == 0) { /* en* */
|
|
mbchar = 0; /* No multi-byte character */
|
|
} else {
|
|
alias = 2;
|
|
#if COMPILER == MSC
|
|
if (pragma == SETLOCALE) /* #pragma setlocale */
|
|
alias = 0;
|
|
#endif
|
|
loc = search_encoding( norm, alias); /* Search the name */
|
|
}
|
|
if (loc == NULL) {
|
|
if ((env || pragma) && (warn_level & 1)) {
|
|
cwarn( unknown_encoding, name, 0L, NULL);
|
|
} else { /* -m option */
|
|
mcpp_fprintf( ERR, unknown_encoding, name);
|
|
mcpp_fputc( '\n', ERR);
|
|
}
|
|
} else {
|
|
mb_init(); /* Re-initialize */
|
|
}
|
|
return loc;
|
|
}
|
|
|
|
static const char * search_encoding(
|
|
char * norm, /* The name of encoding specified */
|
|
int alias /* The number of alias to start searching */
|
|
)
|
|
{
|
|
const char * loc;
|
|
int lo, al;
|
|
|
|
for (lo = 0; lo < NUM_ENCODING; lo++) {
|
|
for (al = alias ; al < NUM_ALIAS; al++) {
|
|
loc = encoding_name[ lo][ al];
|
|
if (str_eq( loc, norm)) {
|
|
switch (lo) {
|
|
case 0 : mbchar = 0; break;
|
|
case 1 : mbchar = EUC_JP; break;
|
|
case 2 : mbchar = GB2312; break;
|
|
case 3 : mbchar = KSC5601; break;
|
|
case 4 : mbchar = SJIS; break;
|
|
case 5 : mbchar = BIGFIVE; break;
|
|
case 6 : mbchar = ISO2022_JP; break;
|
|
case 7 : mbchar = UTF8; break;
|
|
}
|
|
return loc;
|
|
}
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static void strip_bar(
|
|
char * string
|
|
)
|
|
/*
|
|
* Strip '_', '-' or '.' in the string.
|
|
*/
|
|
{
|
|
char * cp = string;
|
|
|
|
while (*cp != EOS) {
|
|
if (*cp == '_' || *cp == '-' || *cp == '.')
|
|
memmove( cp, cp + 1, strlen( cp));
|
|
else
|
|
cp++;
|
|
}
|
|
}
|
|
|
|
static void conv_case(
|
|
char * name, /* (diretory) Name */
|
|
char * lim, /* End of (directory) name */
|
|
int upper /* TRUE if to upper */
|
|
)
|
|
/* Convert a string to upper-case letters or lower-case letters in-place */
|
|
{
|
|
int c;
|
|
char * sp;
|
|
|
|
for (sp = name; sp < lim; sp++) {
|
|
c = *sp & UCHARMAX;
|
|
#if MBCHAR
|
|
if ((char_type[ c] & mbstart)) {
|
|
char tmp[ PATHMAX+1];
|
|
char * tp = tmp;
|
|
*tp++ = *sp++;
|
|
mb_read( c, &sp, &tp);
|
|
} else
|
|
#endif
|
|
{
|
|
if (upper)
|
|
*sp = toupper( c);
|
|
else
|
|
*sp = tolower( c);
|
|
}
|
|
}
|
|
}
|
|
|
|
void mb_init( void)
|
|
/*
|
|
* Initialize multi-byte character settings.
|
|
* First called prior to setting the 'mcpp_mode'.
|
|
* Will be called again each time the multibyte character encoding is changed.
|
|
*/
|
|
{
|
|
/*
|
|
* Select the character classification table, select the multi-byte
|
|
* character reading routine and decide whether multi-byte character
|
|
* may contain the byte of value 0x5c.
|
|
*/
|
|
switch (mbchar) {
|
|
case 0 :
|
|
case EUC_JP :
|
|
case GB2312 :
|
|
case KSC5601 :
|
|
char_type = type_euc;
|
|
bsl_in_mbchar = FALSE;
|
|
mb_read = mb_read_2byte;
|
|
break;
|
|
case SJIS :
|
|
case BIGFIVE :
|
|
char_type = type_bsl;
|
|
bsl_in_mbchar = TRUE;
|
|
mb_read = mb_read_2byte;
|
|
break;
|
|
case ISO2022_JP :
|
|
char_type = type_iso2022_jp;
|
|
bsl_in_mbchar = TRUE;
|
|
mb_read = mb_read_iso2022_jp;
|
|
break;
|
|
case UTF8 :
|
|
char_type = type_utf8;
|
|
bsl_in_mbchar = FALSE;
|
|
mb_read = mb_read_utf8;
|
|
break;
|
|
}
|
|
|
|
/* Set the bit patterns for character classification. */
|
|
switch (mbchar) {
|
|
case 0 :
|
|
mbstart = 0;
|
|
break;
|
|
case EUC_JP :
|
|
mbstart = EJ1;
|
|
mb2 = EJ2;
|
|
break;
|
|
case GB2312 :
|
|
mbstart = GB1;
|
|
mb2 = GB2;
|
|
break;
|
|
case KSC5601:
|
|
mbstart = KS1;
|
|
mb2 = KS2;
|
|
break;
|
|
case SJIS :
|
|
mbstart = SJ1;
|
|
mb2 = SJ2;
|
|
break;
|
|
case BIGFIVE:
|
|
mbstart = BF1;
|
|
mb2 = BF2;
|
|
break;
|
|
case ISO2022_JP :
|
|
mbstart = IS1;
|
|
break;
|
|
case UTF8 :
|
|
mbstart = (U2_1 | U3_1 | U4_1);
|
|
break;
|
|
}
|
|
switch (mbchar) {
|
|
case 0 :
|
|
mbchk = 0;
|
|
break;
|
|
case EUC_JP :
|
|
case GB2312 :
|
|
case KSC5601:
|
|
case SJIS :
|
|
case BIGFIVE:
|
|
case UTF8 :
|
|
mbchk = NA;
|
|
break;
|
|
case ISO2022_JP :
|
|
mbchk = (IS1 | NA);
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Set special handling for some encodings to supplement some compiler's
|
|
* deficiency.
|
|
*/
|
|
switch (mbchar) {
|
|
case SJIS :
|
|
#if ! SJIS_IS_ESCAPE_FREE
|
|
bsl_need_escape = TRUE;
|
|
#endif
|
|
break;
|
|
case BIGFIVE:
|
|
#if ! BIGFIVE_IS_ESCAPE_FREE
|
|
bsl_need_escape = TRUE;
|
|
#endif
|
|
break;
|
|
case ISO2022_JP :
|
|
#if ! ISO2022_JP_IS_ESCAPE_FREE
|
|
bsl_need_escape = TRUE;
|
|
#endif
|
|
break;
|
|
default :
|
|
bsl_need_escape = FALSE;
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Modify magic characters in character type table.
|
|
* char_type[] table should be rewritten in accordance with the 'mcpp_mode'
|
|
* whenever the encoding is changed.
|
|
*/
|
|
if (mcpp_mode) { /* If mcpp_mode is already set */
|
|
char_type[ DEF_MAGIC] = standard ? LET : 0;
|
|
char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
|
|
char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
|
|
? HSPA: 0; /* TOK_SEP equals to COM_SEP */
|
|
}
|
|
}
|
|
|
|
static size_t mb_read_2byte(
|
|
int c1, /* The 1st byte of mbchar sequence (already read) */
|
|
char ** in_pp, /* Pointer to input */
|
|
char ** out_pp /* Pointer to output */
|
|
)
|
|
/*
|
|
* Multi-byte character reading routine for 2-byte encodings.
|
|
*/
|
|
{
|
|
int error = FALSE;
|
|
size_t len = 0; /* Number of multi-byte characters read. */
|
|
char * in_p = *in_pp;
|
|
char * out_p = *out_pp;
|
|
|
|
if (! (char_type[ c1 & UCHARMAX] & mbstart))
|
|
return MB_ERROR; /* Not a multi-byte character */
|
|
|
|
do {
|
|
if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {
|
|
error = TRUE;
|
|
break;
|
|
}
|
|
len++;
|
|
} while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);
|
|
*in_pp = --in_p;
|
|
*(--out_p) = EOS;
|
|
*out_pp = out_p;
|
|
return error ? (len | MB_ERROR) : len;
|
|
}
|
|
|
|
static size_t mb_read_iso2022_jp(
|
|
int c1, /* The 1st byte of the sequence already read (always 0x1b). */
|
|
char ** in_pp,
|
|
char ** out_pp
|
|
)
|
|
/*
|
|
* Multi-byte character reading routine for ISO2022_JP.
|
|
*/
|
|
{
|
|
int error = FALSE;
|
|
size_t len = 0;
|
|
char * in_p = *in_pp;
|
|
char * out_p = *out_pp;
|
|
int c2, c3, c4;
|
|
|
|
if (! (char_type[ c1 & UCHARMAX] & mbstart))
|
|
return MB_ERROR;
|
|
|
|
do {
|
|
|
|
*out_p++ = c2 = *in_p++;
|
|
if (! (char_type[ c2 & UCHARMAX] & IS2)) {
|
|
error = TRUE;
|
|
break;
|
|
}
|
|
*out_p++ = c3 = *in_p++;
|
|
if (! (char_type[ c3 & UCHARMAX] & IS3)) {
|
|
error = TRUE;
|
|
break;
|
|
}
|
|
|
|
switch (c2) {
|
|
case 0x24 :
|
|
switch (c3) {
|
|
case 0x42 : /* 0x1b 0x24 0x42: JIS X 0208-1983 */
|
|
break;
|
|
case 0x28 :
|
|
*out_p++ = c4 = *in_p++;
|
|
if (! (char_type[ c4 & UCHARMAX] & IS4))
|
|
error = TRUE;
|
|
/* else: 0x1b 0x24 0x28 0x44: JIS X 0212 */
|
|
break;
|
|
default :
|
|
error = TRUE;
|
|
}
|
|
break;
|
|
case 0x28 :
|
|
switch (c3) {
|
|
case 0x42 : /* 0x1b 0x28 0x42: ASCII */
|
|
c1 = *out_p++ = *in_p++ & UCHARMAX;
|
|
continue;
|
|
default :
|
|
error = TRUE;
|
|
}
|
|
break;
|
|
}
|
|
if (error)
|
|
break;
|
|
|
|
while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) {
|
|
if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) {
|
|
error = TRUE;
|
|
break;
|
|
}
|
|
len++; /* String of multi-byte characters */
|
|
}
|
|
if (error)
|
|
break;
|
|
|
|
} while (char_type[ c1] & IS1); /* 0x1b: start of shift-sequence */
|
|
|
|
*in_pp = --in_p;
|
|
*(--out_p) = EOS;
|
|
*out_pp = out_p;
|
|
return error ? (len | MB_ERROR) : len;
|
|
}
|
|
|
|
static size_t mb_read_utf8(
|
|
int c1,
|
|
char ** in_pp,
|
|
char ** out_pp
|
|
)
|
|
/*
|
|
* Multi-byte character reading routine for UTF8.
|
|
*/
|
|
{
|
|
int error = FALSE;
|
|
size_t len = 0;
|
|
char * in_p = *in_pp;
|
|
char * out_p = *out_pp;
|
|
|
|
if (! (char_type[ c1 & UCHARMAX] & mbstart))
|
|
return MB_ERROR;
|
|
|
|
do {
|
|
unsigned int codepoint;
|
|
int i, bytes;
|
|
|
|
if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
|
|
bytes = 4; /* 4-byte character */
|
|
else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
|
|
bytes = 3; /* 3-byte character */
|
|
else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
|
|
bytes = 2; /* 2-byte character */
|
|
|
|
/* Must ensure that the sequence is not reserved as a surrogate */
|
|
codepoint = ((2 << (6-bytes)) - 1) & c1; /* mask off top bits */
|
|
|
|
/* All bytes left in the sequence must be in 0x80 - 0xBF */
|
|
for (i = bytes - 1; i && !error; i--) {
|
|
codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);
|
|
if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
|
|
error = TRUE;
|
|
}
|
|
|
|
/* Check for overlong/underlong sequences */
|
|
if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
|
|
|| (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
|
|
|| (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
|
|
error = TRUE;
|
|
if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
|
|
/* Check for reserved surrogate codepoints */
|
|
|| (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
|
|
/* Illegal */
|
|
error = TRUE;
|
|
#if 0
|
|
printf( "codepoint:0x%x\n", codepoint);
|
|
#endif
|
|
if (error)
|
|
break;
|
|
len++;
|
|
} while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
|
|
/* Start of the next multi-byte character */
|
|
*in_pp = --in_p;
|
|
*(--out_p) = EOS;
|
|
*out_pp = out_p;
|
|
return error ? (len | MB_ERROR) : len;
|
|
}
|
|
|
|
uexpr_t mb_eval(
|
|
char ** seq_pp
|
|
)
|
|
/*
|
|
* Evaluate the value of a multi-byte character.
|
|
* This routine does not check the legality of the sequence.
|
|
* This routine is called from eval_char().
|
|
* This routine is never called in POST_STD mode.
|
|
*/
|
|
{
|
|
char * seq = *seq_pp;
|
|
uexpr_t val = 0;
|
|
int c, c1;
|
|
|
|
if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
|
|
*seq_pp = seq;
|
|
return c; /* Not a multi-byte character */
|
|
}
|
|
|
|
switch (mbchar) {
|
|
case EUC_JP :
|
|
case GB2312 :
|
|
case KSC5601:
|
|
case SJIS :
|
|
case BIGFIVE:
|
|
val = (c << 8) + (*seq++ & UCHARMAX);
|
|
/* Evaluate the 2-byte sequence */
|
|
break;
|
|
case ISO2022_JP :
|
|
if (char_type[ c & UCHARMAX] & IS1) { /* Skip shift-sequence */
|
|
if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
|
|
if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
|
|
if (c1 == 0x28)
|
|
seq++;
|
|
if (c == 0x28 && c1 == 0x42) { /* Shift-out sequence */
|
|
val = 0;
|
|
break;
|
|
}
|
|
c = *seq++ & UCHARMAX;
|
|
}
|
|
}
|
|
}
|
|
val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-bytes */
|
|
break;
|
|
case UTF8 : /* Evaluate the sequence of 2, 3 or 4 bytes as it is */
|
|
val = (c << 8) + (*seq++ & UCHARMAX);
|
|
if (char_type[ c & UCHARMAX] & U3_1) {
|
|
val = (val << 8) + (*seq++ & UCHARMAX);
|
|
} else if (char_type[ c & UCHARMAX] & U4_1) {
|
|
val = (val << 8) + (*seq++ & UCHARMAX);
|
|
val = (val << 8) + (*seq++ & UCHARMAX);
|
|
}
|
|
break;
|
|
}
|
|
|
|
*seq_pp = seq;
|
|
return val;
|
|
}
|
|
|
|
int last_is_mbchar(
|
|
const char * in, /* Input physical line */
|
|
int len /* Length of the line minus 2 */
|
|
)
|
|
/*
|
|
* Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
|
|
* else return 0.
|
|
*/
|
|
{
|
|
const char * cp = in + len;
|
|
const char * const endp = in + len; /* -> the char befor '\n' */
|
|
|
|
if ((mbchar & (SJIS | BIGFIVE)) == 0)
|
|
return 0;
|
|
while (in <= --cp) { /* Search backwardly */
|
|
if ((char_type[ *cp & UCHARMAX] & mbstart) == 0)
|
|
break; /* Not the first byte of MBCHAR */
|
|
}
|
|
if ((endp - cp) & 1)
|
|
return 0;
|
|
else
|
|
return 2;
|
|
}
|
|
|