clang 23.0.0git
Encoding.h
Go to the documentation of this file.
1//===--- Encoding.h - Format C++ code ---------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// Contains functions for text encoding manipulation. Supports UTF-8,
11/// 8-bit encodings and escape sequences in C++ string literals.
12///
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
16#define LLVM_CLANG_LIB_FORMAT_ENCODING_H
17
18#include "clang/Basic/LLVM.h"
19#include "llvm/Support/ConvertUTF.h"
20#include "llvm/Support/Unicode.h"
21
22namespace clang {
23namespace format {
24namespace encoding {
25
28 Encoding_Unknown // We treat all other encodings as 8-bit encodings.
29};
30
31/// Detects encoding of the Text. If the Text can be decoded using UTF-8,
32/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
33inline Encoding detectEncoding(StringRef Text) {
34 const llvm::UTF8 *Ptr = reinterpret_cast<const llvm::UTF8 *>(Text.begin());
35 const llvm::UTF8 *BufEnd = reinterpret_cast<const llvm::UTF8 *>(Text.end());
36 if (llvm::isLegalUTF8String(&Ptr, BufEnd))
37 return Encoding_UTF8;
38 return Encoding_Unknown;
39}
40
41/// Returns the number of columns required to display the \p Text on a
42/// generic Unicode-capable terminal. Text is assumed to use the specified
43/// \p Encoding.
44inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
45 if (Encoding == Encoding_UTF8) {
46 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
47 // FIXME: Figure out the correct way to handle this in the presence of both
48 // printable and unprintable multi-byte UTF-8 characters. Falling back to
49 // returning the number of bytes may cause problems, as columnWidth suddenly
50 // becomes non-additive.
51 if (ContentWidth >= 0)
52 return ContentWidth;
53 }
54 return Text.size();
55}
56
57/// Returns the number of columns required to display the \p Text,
58/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
59/// text is assumed to use the specified \p Encoding.
60inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
61 unsigned TabWidth, Encoding Encoding) {
62 unsigned TotalWidth = 0;
63 StringRef Tail = Text;
64 for (;;) {
65 StringRef::size_type TabPos = Tail.find('\t');
66 if (TabPos == StringRef::npos)
67 return TotalWidth + columnWidth(Tail, Encoding);
68 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
69 if (TabWidth)
70 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
71 Tail = Tail.substr(TabPos + 1);
72 }
73}
74
75/// Gets the number of bytes in a sequence representing a single
76/// codepoint and starting with FirstChar in the specified Encoding.
77inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
78 switch (Encoding) {
79 case Encoding_UTF8:
80 return llvm::getNumBytesForUTF8(FirstChar);
81 default:
82 return 1;
83 }
84}
85
/// Returns true if \p c is one of the octal digits '0' through '7'.
inline bool isOctDigit(char c) { return c >= '0' && c <= '7'; }
87
/// Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
92
93/// Gets the length of an escape sequence inside a C++ string literal.
94/// Text should span from the beginning of the escape sequence (starting with a
95/// backslash) to the end of the string literal.
96inline unsigned getEscapeSequenceLength(StringRef Text) {
97 assert(Text[0] == '\\');
98 if (Text.size() < 2)
99 return 1;
100
101 switch (Text[1]) {
102 case 'u':
103 return 6;
104 case 'U':
105 return 10;
106 case 'x': {
107 unsigned I = 2; // Point after '\x'.
108 while (I < Text.size() && isHexDigit(Text[I]))
109 ++I;
110 return I;
111 }
112 default:
113 if (isOctDigit(Text[1])) {
114 unsigned I = 1;
115 while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
116 ++I;
117 return I;
118 }
119 return 1 + llvm::getNumBytesForUTF8(Text[1]);
120 }
121}
122
123} // namespace encoding
124} // namespace format
125} // namespace clang
126
127#endif
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a terminal with the TabWidth.
Definition Encoding.h:60
bool isOctDigit(char c)
Definition Encoding.h:86
Encoding detectEncoding(StringRef Text)
Detects encoding of the Text.
Definition Encoding.h:33
unsigned getEscapeSequenceLength(StringRef Text)
Gets the length of an escape sequence inside a C++ string literal.
Definition Encoding.h:96
bool isHexDigit(char c)
Definition Encoding.h:88
unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding)
Gets the number of bytes in a sequence representing a single codepoint and starting with FirstChar in the specified Encoding.
Definition Encoding.h:77
unsigned columnWidth(StringRef Text, Encoding Encoding)
Returns the number of columns required to display the Text on a generic Unicode-capable terminal.
Definition Encoding.h:44
The JSON file list parser is used to communicate input to InstallAPI.
unsigned TabWidth
The number of columns used for tab stops.
Definition Format.h:5914