1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef __CSRMBCS_H
#define __CSRMBCS_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "csrecog.h"
U_NAMESPACE_BEGIN
// "Character" iterated character class.
// Recognizers for specific mbcs encodings make their "characters" available
// by providing a nextChar() function that fills in an instance of IteratedChar
// with the next char from the input.
// The returned characters are not converted to Unicode, but remain as the raw
// bytes (concatenated into an int) from the codepage data.
//
// For Asian charsets, use the raw input rather than the input that has been
// stripped of markup. Detection only considers multi-byte chars, effectively
// stripping markup anyway, and double byte chars do occur in markup too.
//
class IteratedChar : public UMemory
{
public:
uint32_t charValue; // 1-4 bytes from the raw input data
int32_t index;
int32_t nextIndex;
UBool error;
UBool done;
public:
IteratedChar();
//void reset();
int32_t nextByte(InputText* det);
};
/**
 * Common base class for the multi-byte charset recognizers declared in this
 * header. Concrete recognizers must supply the charset name, the language,
 * match() and the encoding-specific nextChar(); all four are pure virtual here.
 */
class CharsetRecog_mbcs : public CharsetRecognizer {
public:
    virtual ~CharsetRecog_mbcs();

    /**
     * Get the IANA name of this charset.
     * @return the charset name.
     */
    const char *getName() const override = 0;

    /** Language reported by this recognizer; supplied by each subclass. */
    const char *getLanguage() const override = 0;

    /** Charset-specific match entry point; supplied by each subclass. */
    UBool match(InputText* input, CharsetMatch *results) const override = 0;

    /**
     * Get the next character (however many bytes it occupies) from the input
     * data. Subclasses for specific charset encodings implement this according
     * to the rules of their encoding scheme.
     *
     * This is a recognizer method rather than a method of IteratedChar only
     * because the latter would need a lot of extra derived classes, which is
     * awkward.
     *
     * @param it      The IteratedChar "struct" that receives the returned char.
     * @param textIn  The input text, which gives access to the byte data being
     *                iterated over.
     * @return True if a character was returned, false at end of input.
     */
    virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;

protected:
    /**
     * Test how well this charset matches the input text data.
     *
     * @param det             The input text to be checked for being in this
     *                        charset.
     * @param commonChars     Presumably a table of character values that occur
     *                        frequently in this charset -- confirm against the
     *                        implementation.
     * @param commonCharsLen  Presumably the number of entries in commonChars
     *                        -- confirm against the implementation.
     * @return According to the comment inherited from the Java version, two
     *         values packed into one int:
     *         <br/>
     *         bits 0-7: the match confidence, ranging from 0-100
     *         <br/>
     *         bits 8-15: the match reason, an enum-like value.
     *         NOTE(review): verify that the C++ implementation still packs the
     *         result this way.
     */
    int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
};
/**
 * Recognizer for the Shift-JIS charset.
 *
 * Overrides every pure virtual function of CharsetRecog_mbcs.
 */
class CharsetRecog_sjis : public CharsetRecog_mbcs {
public:
    virtual ~CharsetRecog_sjis();

    // Identification of the charset.
    const char *getName() const override;
    const char *getLanguage() const override;

    // Detection entry point and the Shift-JIS character iterator.
    UBool match(InputText* input, CharsetMatch *results) const override;
    UBool nextChar(IteratedChar *it, InputText *det) const override;
};
/**
 * Shared base for the EUC charset recognizers.
 *
 * This abstract class provides the one piece the EUC encodings have in
 * common: fetching the next character according to the EUC encoding scheme.
 * The name, language and match() remain pure virtual; in this header they
 * are supplied by CharsetRecog_euc_jp and CharsetRecog_euc_kr.
 */
class CharsetRecog_euc : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_euc();

    /*
     * Get the next character value for EUC based encodings.
     * The character "value" is simply the raw bytes that make up the
     * character, packed into an int.
     */
    UBool nextChar(IteratedChar *it, InputText *det) const override;

    // Still abstract at this level; each concrete EUC recognizer defines these.
    const char *getName() const override = 0;
    const char *getLanguage() const override = 0;
    UBool match(InputText* input, CharsetMatch *results) const override = 0;
};
/**
 * The charset recognizer for EUC-JP.
 *
 * Character iteration is inherited from CharsetRecog_euc. As noted in the
 * original documentation, a singleton instance of this class is created and
 * kept by the public CharsetDetector class.
 */
class CharsetRecog_euc_jp : public CharsetRecog_euc
{
public:
    virtual ~CharsetRecog_euc_jp();

    UBool match(InputText* input, CharsetMatch *results) const override;

    const char *getName() const override;
    const char *getLanguage() const override;
};
/**
 * The charset recognizer for EUC-KR.
 *
 * Character iteration is inherited from CharsetRecog_euc. As noted in the
 * original documentation, a singleton instance of this class is created and
 * kept by the public CharsetDetector class.
 */
class CharsetRecog_euc_kr : public CharsetRecog_euc
{
public:
    virtual ~CharsetRecog_euc_kr();

    UBool match(InputText* input, CharsetMatch *results) const override;

    const char *getName() const override;
    const char *getLanguage() const override;
};
/**
 * Recognizer for the Big5 charset.
 *
 * Derives directly from CharsetRecog_mbcs and provides its own nextChar().
 */
class CharsetRecog_big5 : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_big5();

    // Identification of the charset.
    const char *getName() const override;
    const char *getLanguage() const override;

    // Detection entry point and the Big5 character iterator.
    UBool match(InputText* input, CharsetMatch *results) const override;
    UBool nextChar(IteratedChar* it, InputText* det) const override;
};
/**
 * Recognizer for GB-18030. Uses simplified Chinese statistics.
 *
 * Derives directly from CharsetRecog_mbcs and provides its own nextChar().
 */
class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_gb_18030();

    // Identification of the charset.
    const char *getName() const override;
    const char *getLanguage() const override;

    // Detection entry point and the GB-18030 character iterator.
    UBool match(InputText* input, CharsetMatch *results) const override;
    UBool nextChar(IteratedChar* it, InputText* det) const override;
};
U_NAMESPACE_END
#endif
#endif /* __CSRMBCS_H */
|