1use encoding_rs::{
2 DecoderResult, Encoding, BIG5, EUC_KR, GBK, SHIFT_JIS, WINDOWS_1250, WINDOWS_1251,
3 WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, WINDOWS_1257,
4 WINDOWS_1258, WINDOWS_874,
5};
6use thiserror::Error;
7
8use crate::LanguageId;
9
/// Legacy (non-Unicode) text encodings this module can encode to and decode from.
///
/// Each variant corresponds to exactly one `encoding_rs` codec of the same name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TextEncoding {
    /// Windows code page 1250.
    Windows1250,
    /// Windows code page 1251.
    Windows1251,
    /// Windows code page 1252.
    Windows1252,
    /// Windows code page 1253.
    Windows1253,
    /// Windows code page 1254.
    Windows1254,
    /// Windows code page 1255.
    Windows1255,
    /// Windows code page 1256.
    Windows1256,
    /// Windows code page 1257.
    Windows1257,
    /// Windows code page 1258.
    Windows1258,
    /// Windows code page 874.
    Windows874,
    /// Shift_JIS (multi-byte).
    ShiftJis,
    /// GBK (multi-byte).
    Gbk,
    /// Big5 (multi-byte).
    Big5,
    /// EUC-KR (multi-byte).
    EucKr,
}
43
44#[derive(Debug, Clone, PartialEq, Eq, Error)]
46#[error("character {character:?} at char index {char_index} is not encodable as {encoding:?}")]
47pub struct EncodeTextError {
48 pub char_index: usize,
50 pub character: char,
52 pub encoding: TextEncoding,
54}
55
56#[derive(Debug, Clone, PartialEq, Eq, Error)]
58#[error("malformed byte sequence at byte index {byte_index} while decoding {encoding:?}")]
59pub struct DecodeTextError {
60 pub byte_index: usize,
62 pub encoding: TextEncoding,
64}
65
66#[derive(Debug, Clone, PartialEq, Eq, Error)]
68#[error("unsupported language id {} for text encoding", language_id.raw())]
69pub struct LanguageEncodingError {
70 pub language_id: LanguageId,
72}
73
74pub fn decode_text(bytes: &[u8], encoding: TextEncoding) -> String {
79 let (decoded, _actual, _had_errors) = encoding_rs_codec(encoding).decode(bytes);
80 decoded.into_owned()
81}
82
83pub fn decode_text_strict(bytes: &[u8], encoding: TextEncoding) -> Result<String, DecodeTextError> {
88 let codec = encoding_rs_codec(encoding);
89 let Some(decoded) = codec.decode_without_bom_handling_and_without_replacement(bytes) else {
90 let byte_index = first_malformed_byte_index(bytes, codec);
91 return Err(DecodeTextError {
92 byte_index,
93 encoding,
94 });
95 };
96 Ok(decoded.into_owned())
97}
98
99pub fn encode_text(input: &str, encoding: TextEncoding) -> Result<Vec<u8>, EncodeTextError> {
104 let codec = encoding_rs_codec(encoding);
105 let (encoded, _actual, had_errors) = codec.encode(input);
106 if !had_errors {
107 return Ok(encoded.into_owned());
108 }
109
110 let (char_index, character) = first_unencodable(input, codec);
111 Err(EncodeTextError {
112 char_index,
113 character,
114 encoding,
115 })
116}
117
118pub fn text_encoding_for_language(
123 language_id: impl Into<LanguageId>,
124) -> Result<TextEncoding, LanguageEncodingError> {
125 let language_id = language_id.into();
126 let encoding = match language_id.raw() {
127 0..=4 => TextEncoding::Windows1252,
129 5 => TextEncoding::Windows1250,
130 32..=40 | 104 => TextEncoding::Windows1250,
132 41..=46 => TextEncoding::Windows1251,
134 47 => TextEncoding::Windows1253,
136 48..=50 => TextEncoding::Windows1254,
138 51 => TextEncoding::Windows1255,
140 52..=54 => TextEncoding::Windows1256,
142 55..=57 | 105 => TextEncoding::Windows1257,
144 58 => TextEncoding::Windows1258,
146 59 => TextEncoding::Windows874,
148 128 => TextEncoding::EucKr,
150 129 => TextEncoding::Big5,
151 130 => TextEncoding::Gbk,
152 131 => TextEncoding::ShiftJis,
153 70..=72 => {
157 return Err(LanguageEncodingError { language_id });
158 }
159 _ => TextEncoding::Windows1252,
161 };
162 Ok(encoding)
163}
164
165fn encoding_rs_codec(encoding: TextEncoding) -> &'static Encoding {
166 match encoding {
167 TextEncoding::Windows1250 => WINDOWS_1250,
168 TextEncoding::Windows1251 => WINDOWS_1251,
169 TextEncoding::Windows1252 => WINDOWS_1252,
170 TextEncoding::Windows1253 => WINDOWS_1253,
171 TextEncoding::Windows1254 => WINDOWS_1254,
172 TextEncoding::Windows1255 => WINDOWS_1255,
173 TextEncoding::Windows1256 => WINDOWS_1256,
174 TextEncoding::Windows1257 => WINDOWS_1257,
175 TextEncoding::Windows1258 => WINDOWS_1258,
176 TextEncoding::Windows874 => WINDOWS_874,
177 TextEncoding::ShiftJis => SHIFT_JIS,
178 TextEncoding::Gbk => GBK,
179 TextEncoding::Big5 => BIG5,
180 TextEncoding::EucKr => EUC_KR,
181 }
182}
183
184fn first_unencodable(input: &str, codec: &'static Encoding) -> (usize, char) {
185 let mut buf = [0u8; 4];
186 for (char_index, ch) in input.chars().enumerate() {
187 let s = ch.encode_utf8(&mut buf);
188 let (_encoded, _actual, had_errors) = codec.encode(s);
189 if had_errors {
190 return (char_index, ch);
191 }
192 }
193 (0, '\u{FFFD}')
194}
195
196fn first_malformed_byte_index(bytes: &[u8], codec: &'static Encoding) -> usize {
197 let mut decoder = codec.new_decoder_without_bom_handling();
198 let mut output = String::new();
199 let mut input = bytes;
200 let mut consumed_total = 0usize;
201
202 loop {
203 let reserve = decoder
204 .max_utf8_buffer_length_without_replacement(input.len())
205 .unwrap_or(input.len().saturating_mul(4).saturating_add(16));
206 output.reserve(reserve.max(8));
207
208 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut output, true);
209 consumed_total = consumed_total.saturating_add(read);
210 input = &input[read..];
211
212 match result {
213 DecoderResult::InputEmpty => return bytes.len(),
214 DecoderResult::OutputFull => continue,
215 DecoderResult::Malformed(_, _) => return consumed_total,
216 }
217 }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `text` with `encoding` and lossily decodes the bytes back.
    fn roundtrip(text: &str, encoding: TextEncoding) -> String {
        let bytes = encode_text(text, encoding).expect("must encode");
        decode_text(&bytes, encoding)
    }

    #[test]
    fn windows_1252_roundtrip_is_lossless_for_supported_text() {
        let text = "“Hello” € test";
        assert_eq!(roundtrip(text, TextEncoding::Windows1252), text);
    }

    #[test]
    fn strict_encoder_rejects_unencodable_characters() {
        let failure = encode_text("hello 😀", TextEncoding::Windows1252).expect_err("must fail");
        assert_eq!(failure.character, '😀');
    }

    #[test]
    fn windows_1250_roundtrip_is_lossless_for_supported_text() {
        let text = "Zażółć gęślą jaźń";
        assert_eq!(roundtrip(text, TextEncoding::Windows1250), text);
    }

    #[test]
    fn strict_decoder_rejects_malformed_multibyte_sequences() {
        // A lone Shift_JIS lead byte with no trail byte is malformed.
        let truncated = [0x81];
        let failure = decode_text_strict(&truncated, TextEncoding::ShiftJis).expect_err("must fail");
        assert_eq!(failure.encoding, TextEncoding::ShiftJis);
    }

    #[test]
    fn strict_decoder_roundtrips_valid_multibyte_sequences() {
        let bytes = encode_text("テスト", TextEncoding::ShiftJis).expect("must encode");
        let text = decode_text_strict(&bytes, TextEncoding::ShiftJis).expect("must decode");
        assert_eq!(text, "テスト");
    }

    #[test]
    fn language_id_mapping_returns_expected_encodings() {
        let english = text_encoding_for_language(0).expect("english");
        assert_eq!(english, TextEncoding::Windows1252);

        let polish = text_encoding_for_language(5).expect("polish");
        assert_eq!(polish, TextEncoding::Windows1250);

        let thai = text_encoding_for_language(59).expect("thai");
        assert_eq!(thai, TextEncoding::Windows874);

        let japanese = text_encoding_for_language(131).expect("japanese");
        assert_eq!(japanese, TextEncoding::ShiftJis);
    }

    #[test]
    fn language_id_mapping_rejects_unsupported_legacy_codepages() {
        let failure = text_encoding_for_language(70).expect_err("must fail");
        assert_eq!(failure.language_id, LanguageId::from_raw(70));
    }
}