555 lines
26 KiB
Groff
555 lines
26 KiB
Groff
'\"
|
|
'\" Copyright (c) 1997-1998 Sun Microsystems, Inc.
|
|
'\"
|
|
'\" See the file "license.terms" for information on usage and redistribution
|
|
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
|
|
'\"
|
|
.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
|
|
.so man.macros
|
|
.BS
|
|
.SH NAME
|
|
Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings
|
|
.SH SYNOPSIS
|
|
.nf
|
|
\fB#include <tcl.h>\fR
|
|
.sp
|
|
Tcl_Encoding
|
|
\fBTcl_GetEncoding\fR(\fIinterp, name\fR)
|
|
.sp
|
|
void
|
|
\fBTcl_FreeEncoding\fR(\fIencoding\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
|
|
dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
|
|
dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_WinTCharToUtf\fR(\fItsrc, srcLen, dstPtr\fR)
|
|
.sp
|
|
TCHAR *
|
|
\fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_GetEncodingName\fR(\fIencoding\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR)
|
|
.sp
|
|
void
|
|
\fBTcl_GetEncodingNames\fR(\fIinterp\fR)
|
|
.sp
|
|
Tcl_Encoding
|
|
\fBTcl_CreateEncoding\fR(\fItypePtr\fR)
|
|
.sp
|
|
Tcl_Obj *
|
|
\fBTcl_GetEncodingSearchPath\fR()
|
|
.sp
|
|
int
|
|
\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
|
|
.sp
|
|
void
|
|
\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)
|
|
.SH ARGUMENTS
|
|
.AS "const Tcl_EncodingType" *dstWrotePtr in/out
|
|
.AP Tcl_Interp *interp in
|
|
Interpreter to use for error reporting, or NULL if no error reporting is
|
|
desired.
|
|
.AP "const char" *name in
|
|
Name of encoding to load.
|
|
.AP Tcl_Encoding encoding in
|
|
The encoding to query, free, or use for converting text. If \fIencoding\fR is
|
|
NULL, the current system encoding is used.
|
|
.AP Tcl_Obj *objPtr in
|
|
Name of encoding to get token for.
|
|
.AP Tcl_Encoding *encodingPtr out
|
|
Points to storage where encoding token is to be written.
|
|
.AP "const char" *src in
|
|
For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
|
|
specified encoding that are to be converted to UTF-8. For the
|
|
\fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of
|
|
UTF-8 characters to be converted to the specified encoding.
|
|
.AP "const TCHAR" *tsrc in
|
|
An array of Windows TCHAR characters to convert to UTF-8.
|
|
.AP int srcLen in
|
|
Length of \fIsrc\fR or \fItsrc\fR in bytes. If the length is negative, the
|
|
encoding-specific length of the string is used.
|
|
.AP Tcl_DString *dstPtr out
|
|
Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted
|
|
result will be stored.
|
|
.AP int flags in
|
|
Various flag bits OR-ed together.
|
|
\fBTCL_ENCODING_START\fR signifies that the
|
|
source buffer is the first block in a (potentially multi-block) input
|
|
stream, telling the conversion routine to reset to an initial state and
|
|
perform any initialization that needs to occur before the first byte is
|
|
converted. \fBTCL_ENCODING_END\fR signifies that the source buffer is the last
|
|
block in a (potentially multi-block) input stream, telling the conversion
|
|
routine to perform any finalization that needs to occur after the last
|
|
byte is converted and then to reset to an initial state.
|
|
\fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should
|
|
return immediately upon reading a source character that does not exist in
|
|
the target encoding; otherwise a default fallback character will
|
|
automatically be substituted.
|
|
.AP Tcl_EncodingState *statePtr in/out
|
|
Used when converting a (generally long or indefinite length) byte stream
|
|
in a piece-by-piece fashion. The conversion routine stores its current
|
|
state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the
|
|
current piece) has been converted; that state information must be passed
|
|
back when converting the next piece of the stream so the conversion
|
|
routine knows what state it was in when it left off at the end of the
|
|
last piece. May be NULL, in which case the value specified for \fIflags\fR
|
|
is ignored and the source buffer is assumed to contain the complete string to
|
|
convert.
|
|
.AP char *dst out
|
|
Buffer in which the converted result will be stored. No more than
|
|
\fIdstLen\fR bytes will be stored in \fIdst\fR.
|
|
.AP int dstLen in
|
|
The maximum length of the output buffer \fIdst\fR in bytes.
|
|
.AP int *srcReadPtr out
|
|
Filled with the number of bytes from \fIsrc\fR that were actually
|
|
converted. This may be less than the original source length if there was
|
|
a problem converting some source characters. May be NULL.
|
|
.AP int *dstWrotePtr out
|
|
Filled with the number of bytes that were actually stored in the output
|
|
buffer as a result of the conversion. May be NULL.
|
|
.AP int *dstCharsPtr out
|
|
Filled with the number of characters that correspond to the number of bytes
|
|
stored in the output buffer. May be NULL.
|
|
.AP Tcl_DString *bufPtr out
|
|
Storage for the prescribed system encoding name.
|
|
.AP "const Tcl_EncodingType" *typePtr in
|
|
Structure that defines a new type of encoding.
|
|
.AP Tcl_Obj *searchPath in
|
|
List of filesystem directories in which to search for encoding data files.
|
|
.AP "const char" *path in
|
|
A path to the location of the encoding file.
|
|
.BE
|
|
.SH INTRODUCTION
|
|
.PP
|
|
These routines convert between Tcl's internal character representation,
|
|
UTF-8, and character representations used by various operating systems or
|
|
file systems, such as Unicode, ASCII, or Shift-JIS. When operating on
|
|
strings, such as obtaining the names of files or displaying
|
|
characters using international fonts, the strings must be translated into
|
|
one or possibly multiple formats that the various system calls can use. For
|
|
instance, on a Japanese Unix workstation, a user might obtain a filename
|
|
represented in the EUC-JP file encoding and then translate the characters to
|
|
the jisx0208 font encoding in order to display the filename in a Tk widget.
|
|
The purpose of the encoding package is to help bridge the translation gap.
|
|
UTF-8 provides an intermediate staging ground for all the various
|
|
encodings. In the example above, text would be translated into UTF-8 from
|
|
whatever file encoding the operating system is using. Then it would be
|
|
translated from UTF-8 into whatever font encoding the display routines
|
|
require.
|
|
.PP
|
|
Some basic encodings are compiled into Tcl. Others can be defined by the
|
|
user or dynamically loaded from encoding files in a
|
|
platform-independent manner.
|
|
.SH DESCRIPTION
|
|
.PP
|
|
\fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR. The name may
|
|
refer to a built-in Tcl encoding, a user-defined encoding registered by
|
|
calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding
|
|
file. The return value is a token that represents the encoding and can be
|
|
used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR,
|
|
\fBTcl_FreeEncoding\fR, and \fBTcl_UtfToExternal\fR. If the name did not
|
|
refer to any known or loadable encoding, NULL is returned and an error
|
|
message is returned in \fIinterp\fR.
|
|
.PP
|
|
The encoding package maintains a database of all encodings currently in use.
|
|
The first time \fIname\fR is seen, \fBTcl_GetEncoding\fR returns an
|
|
encoding with a reference count of 1. If the same \fIname\fR is requested
|
|
further times, then the reference count for that encoding is incremented
|
|
without the overhead of allocating a new encoding and all its associated
|
|
data structures.
|
|
.PP
|
|
When an \fIencoding\fR is no longer needed, \fBTcl_FreeEncoding\fR
|
|
should be called to release it. When an \fIencoding\fR is no longer in use
|
|
anywhere (i.e., it has been freed as many times as it has been gotten),
|
|
\fBTcl_FreeEncoding\fR will release all storage the encoding was using
|
|
and delete it from the database.
|
|
.PP
|
|
\fBTcl_GetEncodingFromObj\fR treats the string representation of
|
|
\fIobjPtr\fR as an encoding name, and finds an encoding with that
|
|
name, just as \fBTcl_GetEncoding\fR does. When an encoding is found,
|
|
it is cached within the \fIobjPtr\fR value for future reference, the
|
|
\fBTcl_Encoding\fR token is written to the storage pointed to by
|
|
\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such
|
|
encoding is found, the value \fBTCL_ERROR\fR is returned, and no
|
|
writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with
|
|
\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR
|
|
on the resulting encoding token when that token will no longer be
|
|
used.
|
|
.PP
|
|
\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
|
|
specified \fIencoding\fR into UTF-8. The converted bytes are stored in
|
|
\fIdstPtr\fR, which is then null-terminated. The caller should eventually
|
|
call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR.
|
|
When converting, if any of the characters in the source buffer cannot be
|
|
represented in the target encoding, a default fallback character will be
|
|
used. The return value is a pointer to the value stored in the DString.
|
|
.PP
|
|
\fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified
|
|
\fIencoding\fR into UTF-8. Up to \fIsrcLen\fR bytes are converted from the
|
|
source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR.
|
|
In all cases, \fI*srcReadPtr\fR is filled with the number of bytes that were
|
|
successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR is filled with
|
|
the corresponding number of bytes that were stored in \fIdst\fR. The return
|
|
value is one of the following:
|
|
.RS
|
|
.IP \fBTCL_OK\fR 29
|
|
All bytes of \fIsrc\fR were converted.
|
|
.IP \fBTCL_CONVERT_NOSPACE\fR 29
|
|
The destination buffer was not large enough for all of the converted data; as
|
|
many characters as could fit were converted though.
|
|
.IP \fBTCL_CONVERT_MULTIBYTE\fR 29
|
|
The last few bytes in the source buffer were the beginning of a multibyte
|
|
sequence, but more bytes were needed to complete this sequence. A
|
|
subsequent call to the conversion routine should pass a buffer containing
|
|
the unconverted bytes that remained in \fIsrc\fR plus some further bytes
|
|
from the source stream to properly convert the formerly split-up multibyte
|
|
sequence.
|
|
.IP \fBTCL_CONVERT_SYNTAX\fR 29
|
|
The source buffer contained an invalid character sequence. This may occur
|
|
if the input stream has been damaged or if the input encoding method was
|
|
misidentified.
|
|
.IP \fBTCL_CONVERT_UNKNOWN\fR 29
|
|
The source buffer contained a character that could not be represented in
|
|
the target encoding and \fBTCL_ENCODING_STOPONERROR\fR was specified.
|
|
.RE
|
|
.LP
|
|
\fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8
|
|
into the specified \fIencoding\fR. The converted bytes are stored in
|
|
\fIdstPtr\fR, which is then terminated with the appropriate encoding-specific
|
|
null. The caller should eventually call \fBTcl_DStringFree\fR to free any
|
|
information stored in \fIdstPtr\fR. When converting, if any of the
|
|
characters in the source buffer cannot be represented in the target
|
|
encoding, a default fallback character will be used. The return value is
|
|
a pointer to the value stored in the DString.
|
|
.PP
|
|
\fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into
|
|
the specified \fIencoding\fR. Up to \fIsrcLen\fR bytes are converted from
|
|
the source buffer and up to \fIdstLen\fR converted bytes are stored in
|
|
\fIdst\fR. In all cases, \fI*srcReadPtr\fR is filled with the number of
|
|
bytes that were successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR
|
|
is filled with the corresponding number of bytes that were stored in
|
|
\fIdst\fR. The return values are the same as the return values for
|
|
\fBTcl_ExternalToUtf\fR.
|
|
.PP
|
|
\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are
|
|
Windows-only convenience
|
|
functions for converting between UTF-8 and Windows strings
|
|
based on the TCHAR type which is by convention
|
|
a Unicode character on Windows NT.
|
|
.PP
|
|
\fBTcl_GetEncodingName\fR is roughly the inverse of \fBTcl_GetEncoding\fR.
|
|
Given an \fIencoding\fR, the return value is the \fIname\fR argument that
|
|
was used to create the encoding. The string returned by
|
|
\fBTcl_GetEncodingName\fR is only guaranteed to persist until the
|
|
\fIencoding\fR is deleted. The caller must not modify this string.
|
|
.PP
|
|
\fBTcl_SetSystemEncoding\fR sets the default encoding that should be used
|
|
whenever the user passes a NULL value for the \fIencoding\fR argument to
|
|
any of the other encoding functions. If \fIname\fR is NULL, the system
|
|
encoding is reset to the default system encoding, \fBbinary\fR. If the
|
|
name did not refer to any known or loadable encoding, \fBTCL_ERROR\fR is
|
|
returned and an error message is left in \fIinterp\fR. Otherwise, this
|
|
procedure increments the reference count of the new system encoding,
|
|
decrements the reference count of the old system encoding, and returns
|
|
\fBTCL_OK\fR.
|
|
.PP
|
|
\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl
|
|
library to report the encoding name it believes to be the correct one
|
|
to use as the system encoding, based on system calls and examination of
|
|
the environment suitable for the platform. It accepts \fIbufPtr\fR,
|
|
a pointer to an uninitialized or freed \fBTcl_DString\fR and writes
|
|
the encoding name to it. The \fBTcl_DStringValue\fR is returned.
|
|
.PP
|
|
\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
|
|
consisting of the names of all the encodings that are currently defined
|
|
or can be dynamically loaded, searching the encoding path specified by
|
|
\fBTcl_SetEncodingSearchPath\fR. This procedure does not ensure that the
|
|
dynamically-loadable encoding files contain valid data, but merely that they
|
|
exist.
|
|
.PP
|
|
\fBTcl_CreateEncoding\fR defines a new encoding and registers the C
|
|
procedures that are called back to convert between the encoding and
|
|
UTF-8. Encodings created by \fBTcl_CreateEncoding\fR are thereafter
|
|
visible in the database used by \fBTcl_GetEncoding\fR. Just as with the
|
|
\fBTcl_GetEncoding\fR procedure, the return value is a token that
|
|
represents the encoding and can be used in subsequent calls to other
|
|
encoding functions. \fBTcl_CreateEncoding\fR returns an encoding with a
|
|
reference count of 1. If an encoding with the specified \fIname\fR
|
|
already exists, then its entry in the database is replaced with the new
|
|
encoding; the token for the old encoding will remain valid and continue
|
|
to behave as before, but users of the new token will now call the new
|
|
encoding procedures.
|
|
.PP
|
|
The \fItypePtr\fR argument to \fBTcl_CreateEncoding\fR contains information
|
|
about the name of the encoding and the procedures that will be called to
|
|
convert between this encoding and UTF-8. It is defined as follows:
|
|
.PP
|
|
.CS
|
|
typedef struct Tcl_EncodingType {
|
|
const char *\fIencodingName\fR;
|
|
Tcl_EncodingConvertProc *\fItoUtfProc\fR;
|
|
Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
|
|
Tcl_EncodingFreeProc *\fIfreeProc\fR;
|
|
ClientData \fIclientData\fR;
|
|
int \fInullSize\fR;
|
|
} \fBTcl_EncodingType\fR;
|
|
.CE
|
|
.PP
|
|
The \fIencodingName\fR provides a string name for the encoding, by
|
|
which it can be referred to in other procedures such as
|
|
\fBTcl_GetEncoding\fR. The \fItoUtfProc\fR refers to a callback
|
|
procedure to invoke to convert text from this encoding into UTF-8.
|
|
The \fIfromUtfProc\fR refers to a callback procedure to invoke to
|
|
convert text from UTF-8 into this encoding. The \fIfreeProc\fR refers
|
|
to a callback procedure to invoke when this encoding is deleted. The
|
|
\fIfreeProc\fR field may be NULL. The \fIclientData\fR contains an
|
|
arbitrary one-word value passed to \fItoUtfProc\fR, \fIfromUtfProc\fR,
|
|
and \fIfreeProc\fR whenever they are called. Typically, this is a
|
|
pointer to a data structure containing encoding-specific information
|
|
that can be used by the callback procedures. For instance, two very
|
|
similar encodings such as \fBascii\fR and \fBmacRoman\fR may use the
|
|
same callback procedure, but use different values of \fIclientData\fR
|
|
to control its behavior. The \fInullSize\fR specifies the number of
|
|
zero bytes that signify end-of-string in this encoding. It must be
|
|
\fB1\fR (for single-byte or multi-byte encodings like ASCII or
|
|
Shift-JIS) or \fB2\fR (for double-byte encodings like Unicode).
|
|
Constant-sized encodings with 3 or more bytes per character (such as
|
|
CNS11643) are not accepted.
|
|
.PP
|
|
The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
|
|
type \fBTcl_EncodingConvertProc\fR:
|
|
.PP
|
|
.CS
|
|
typedef int \fBTcl_EncodingConvertProc\fR(
|
|
ClientData \fIclientData\fR,
|
|
const char *\fIsrc\fR,
|
|
int \fIsrcLen\fR,
|
|
int \fIflags\fR,
|
|
Tcl_EncodingState *\fIstatePtr\fR,
|
|
char *\fIdst\fR,
|
|
int \fIdstLen\fR,
|
|
int *\fIsrcReadPtr\fR,
|
|
int *\fIdstWrotePtr\fR,
|
|
int *\fIdstCharsPtr\fR);
|
|
.CE
|
|
.PP
|
|
The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the
|
|
\fBTcl_ExternalToUtf\fR or \fBTcl_UtfToExternal\fR family of functions to
|
|
perform the actual conversion. The \fIclientData\fR parameter to these
|
|
procedures is the same as the \fIclientData\fR field specified to
|
|
\fBTcl_CreateEncoding\fR when the encoding was created. The remaining
|
|
arguments to the callback procedures are the same as the arguments,
|
|
documented at the top, to \fBTcl_ExternalToUtf\fR or
|
|
\fBTcl_UtfToExternal\fR, with the following exceptions. If the
|
|
\fIsrcLen\fR argument to one of those high-level functions is negative,
|
|
the value passed to the callback procedure will be the appropriate
|
|
encoding-specific string length of \fIsrc\fR. If any of the \fIsrcReadPtr\fR,
|
|
\fIdstWrotePtr\fR, or \fIdstCharsPtr\fR arguments to one of the high-level
|
|
functions is NULL, the corresponding value passed to the callback
|
|
procedure will be a non-NULL location.
|
|
.PP
|
|
The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
|
|
\fBTcl_EncodingFreeProc\fR:
|
|
.PP
|
|
.CS
|
|
typedef void \fBTcl_EncodingFreeProc\fR(
|
|
ClientData \fIclientData\fR);
|
|
.CE
|
|
.PP
|
|
This \fIfreeProc\fR function is called when the encoding is deleted. The
|
|
\fIclientData\fR parameter is the same as the \fIclientData\fR field
|
|
specified to \fBTcl_CreateEncoding\fR when the encoding was created.
|
|
.PP
|
|
\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR
|
|
are called to access and set the list of filesystem directories searched
|
|
for encoding data files.
|
|
.PP
|
|
The value returned by \fBTcl_GetEncodingSearchPath\fR
|
|
is the value stored by the last successful call to
|
|
\fBTcl_SetEncodingSearchPath\fR. If no calls to
|
|
\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial
|
|
value based on the environment. There is one encoding search path for the
|
|
entire process, shared by all threads in the process.
|
|
.PP
|
|
\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns
|
|
\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which
|
|
causes \fBTCL_ERROR\fR to be returned. The elements of \fIsearchPath\fR
|
|
are not verified as existing readable filesystem directories. When
|
|
searching for encoding data files takes place, any non-existent or
|
|
non-readable filesystem directories on the \fIsearchPath\fR are silently
|
|
ignored.
|
|
.PP
|
|
\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
|
|
are obsolete interfaces best replaced with calls to
|
|
\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR.
|
|
They are called to access and set the first element of the \fIsearchPath\fR
|
|
list. Since Tcl searches \fIsearchPath\fR for encoding data files in
|
|
list order, these routines establish the
|
|
.QW default
|
|
directory in which to find encoding data files.
|
|
.SH "ENCODING FILES"
|
|
Space would prohibit precompiling into Tcl every possible encoding
|
|
algorithm, so many encodings are stored on disk as dynamically-loadable
|
|
encoding files. This behavior also allows the user to create additional
|
|
encoding files that can be loaded using the same mechanism. These
|
|
encoding files contain information about the tables and/or escape
|
|
sequences used to map between an external encoding and Unicode. The
|
|
external encoding may consist of single-byte, multi-byte, or double-byte
|
|
characters.
|
|
.PP
|
|
Each dynamically-loadable encoding is represented as a text file. The
|
|
initial line of the file, beginning with a
|
|
.QW #
|
|
symbol, is a comment
|
|
that provides a human-readable description of the file. The next line
|
|
identifies the type of encoding file. It can be one of the following
|
|
letters:
|
|
.IP "[1] \fBS\fR"
|
|
A single-byte encoding, where one character is always one byte long in the
|
|
encoding. An example is \fBiso8859-1\fR, used by many European languages.
|
|
.IP "[2] \fBD\fR"
|
|
A double-byte encoding, where one character is always two bytes long in the
|
|
encoding. An example is \fBbig5\fR, used for Chinese text.
|
|
.IP "[3] \fBM\fR"
|
|
A multi-byte encoding, where one character may be either one or two bytes long.
|
|
Certain bytes are lead bytes, indicating that another byte must follow
|
|
and that together the two bytes represent one character. Other bytes are not
|
|
lead bytes and represent themselves. An example is \fBshiftjis\fR, used by
|
|
many Japanese computers.
|
|
.IP "[4] \fBE\fR"
|
|
An escape-sequence encoding, specifying that certain sequences of bytes
|
|
do not represent characters, but commands that describe how following bytes
|
|
should be interpreted.
|
|
.PP
|
|
The rest of the lines in the file depend on the type.
|
|
.PP
|
|
Cases [1], [2], and [3] are collectively referred to as table-based encoding
|
|
files. The lines in a table-based encoding file are in the same
|
|
format as this example taken from the \fBshiftjis\fR encoding (this is not
|
|
the complete file):
|
|
.PP
|
|
.CS
|
|
# Encoding file: shiftjis, multi-byte
|
|
M
|
|
003F 0 40
|
|
00
|
|
0000000100020003000400050006000700080009000A000B000C000D000E000F
|
|
0010001100120013001400150016001700180019001A001B001C001D001E001F
|
|
0020002100220023002400250026002700280029002A002B002C002D002E002F
|
|
0030003100320033003400350036003700380039003A003B003C003D003E003F
|
|
0040004100420043004400450046004700480049004A004B004C004D004E004F
|
|
0050005100520053005400550056005700580059005A005B005C005D005E005F
|
|
0060006100620063006400650066006700680069006A006B006C006D006E006F
|
|
0070007100720073007400750076007700780079007A007B007C007D203E007F
|
|
0080000000000000000000000000000000000000000000000000000000000000
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
|
|
FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
|
|
FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
|
|
FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
81
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
0000000000000000000000000000000000000000000000000000000000000000
|
|
300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
|
|
FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
|
|
301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
|
|
FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
|
|
00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
|
|
FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
|
|
25A125A025B325B225BD25BC203B301221922190219121933013000000000000
|
|
000000000000000000000000000000002208220B2286228722822283222A2229
|
|
000000000000000000000000000000002227222800AC21D221D4220022030000
|
|
0000000000000000000000000000000000000000222022A52312220222072261
|
|
2252226A226B221A223D221D2235222B222C0000000000000000000000000000
|
|
212B2030266F266D266A2020202100B6000000000000000025EF000000000000
|
|
.CE
|
|
.PP
|
|
The third line of the file is three numbers. The first number is the
|
|
fallback character (in base 16) to use when converting from UTF-8 to this
|
|
encoding. The second number is a \fB1\fR if this file represents the
|
|
encoding for a symbol font, or \fB0\fR otherwise. The last number (in base
|
|
10) is how many pages of data follow.
|
|
.PP
|
|
Subsequent lines in the example above are pages that describe how to map
|
|
from the encoding into 2-byte Unicode. The first line in a page identifies
|
|
the page number. Following it are 256 double-byte numbers, arranged as 16
|
|
rows of 16 numbers. Given a character in the encoding, the high byte of
|
|
that character is used to select which page, and the low byte of that
|
|
character is used as an index to select one of the double-byte numbers in
|
|
that page \- the value obtained being the corresponding Unicode character.
|
|
By examination of the example above, one can see that the characters 0x7E
|
|
and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
|
|
.PP
|
|
Following the first page will be all the other pages, each in the same
|
|
format as the first: one number identifying the page followed by 256
|
|
double-byte Unicode characters. If a character in the encoding maps to the
|
|
Unicode character 0000, it means that the character does not actually exist.
|
|
If all characters on a page would map to 0000, that page can be omitted.
|
|
.PP
|
|
Case [4] is the escape-sequence encoding file. The lines in this type of
|
|
file are in the same format as this example taken from the \fBiso2022-jp\fR
|
|
encoding:
|
|
.PP
|
|
.CS
|
|
.ta 1.5i
|
|
# Encoding file: iso2022-jp, escape-driven
|
|
E
|
|
init {}
|
|
final {}
|
|
iso8859-1 \ex1b(B
|
|
jis0201 \ex1b(J
|
|
jis0208 \ex1b$@
|
|
jis0208 \ex1b$B
|
|
jis0212 \ex1b$(D
|
|
gb2312 \ex1b$A
|
|
ksc5601 \ex1b$(C
|
|
.CE
|
|
.PP
|
|
In the file, the first column represents an option and the second column
|
|
is the associated value. \fBinit\fR is a string to emit or expect before
|
|
the first character is converted, while \fBfinal\fR is a string to emit
|
|
or expect after the last character. All other options are names of
|
|
table-based encodings; the associated value is the escape-sequence that
|
|
marks that encoding. Tcl syntax is used for the values; in the above
|
|
example, for instance,
|
|
.QW \fB{}\fR
|
|
represents the empty string and
|
|
.QW \fB\ex1b\fR
|
|
represents character 27.
|
|
.PP
|
|
When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
|
|
been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
|
|
from the \fBencoding\fR subdirectory of each directory that Tcl searches
|
|
for its script library. If the encoding file exists, but is
|
|
malformed, an error message will be left in \fIinterp\fR.
|
|
.SH KEYWORDS
|
|
utf, encoding, convert
|