clang API Documentation
/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *
 *==------------------------------------------------------------------------==*/
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers.  The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd.  Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument.  When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error.  When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return.  This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32.  Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed.  When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem.  E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
         Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

#ifndef CLANG_BASIC_CONVERTUTF_H
#define CLANG_BASIC_CONVERTUTF_H

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/* Some fundamental constants.
 *
 * Each expansion is wrapped in an outer pair of parentheses so that the
 * cast expression behaves as a single operand wherever the macro is used
 * (e.g. as the operand of sizeof, or next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)

/* Outcome of a conversion routine.  Only the first problem encountered
 * is reported. */
typedef enum {
  conversionOK,           /* conversion successful */
  sourceExhausted,        /* partial character in source, but hit end */
  targetExhausted,        /* insuff. room in target for conversion */
  sourceIllegal           /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects how irregular sequences and isolated surrogates are treated:
 * strictConversion reports them as errors, lenientConversion converts
 * them (see the overview comment at the top of this file). */
typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif

/* Convert the UTF-8 text in [*sourceStart, sourceEnd) to UTF-16, writing
 * into [*targetStart, targetEnd).  On return *sourceStart and *targetStart
 * are advanced past the text successfully converted; when sourceIllegal is
 * returned, *sourceStart points at the start of the malformed sequence. */
ConversionResult ConvertUTF8toUTF16(
    const UTF8 **sourceStart, const UTF8 *sourceEnd,
    UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags);

/* Convert the UTF-8 text in [*sourceStart, sourceEnd) to UTF-32, writing
 * into [*targetStart, targetEnd).  Pointer-update and return-value
 * conventions are the same as for ConvertUTF8toUTF16. */
ConversionResult ConvertUTF8toUTF32(
    const UTF8 **sourceStart, const UTF8 *sourceEnd,
    UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags);

/* The remaining conversions are only declared when
 * CLANG_NEEDS_THESE_ONE_DAY is defined.  They follow the same buffer and
 * return-value conventions described in the overview comment. */
#ifdef CLANG_NEEDS_THESE_ONE_DAY
ConversionResult ConvertUTF16toUTF8(
    const UTF16 **sourceStart, const UTF16 *sourceEnd,
    UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF8(
    const UTF32 **sourceStart, const UTF32 *sourceEnd,
    UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF32(
    const UTF16 **sourceStart, const UTF16 *sourceEnd,
    UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF16(
    const UTF32 **sourceStart, const UTF32 *sourceEnd,
    UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags);
#endif

/* NOTE(review): the implementation is not visible from this header.
 * Presumably returns nonzero when the single UTF-8 sequence starting at
 * source (and ending before sourceEnd) is legal -- confirm against
 * ConvertUTF.c. */
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

/* NOTE(review): the implementation is not visible from this header.
 * Presumably returns nonzero when every sequence in [source, sourceEnd)
 * is legal UTF-8 -- confirm against ConvertUTF.c. */
Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd);

#ifdef __cplusplus
}
#endif

#endif

/* --------------------------------------------------------------------- */