rakata_core/
text.rs

1use encoding_rs::{
2    DecoderResult, Encoding, BIG5, EUC_KR, GBK, SHIFT_JIS, WINDOWS_1250, WINDOWS_1251,
3    WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255, WINDOWS_1256, WINDOWS_1257,
4    WINDOWS_1258, WINDOWS_874,
5};
6use thiserror::Error;
7
8use crate::LanguageId;
9
/// Text encodings currently used by supported KotOR formats.
///
/// Each variant names one legacy codepage. The decode/encode helpers in this
/// module take a `TextEncoding` and resolve it to the matching `encoding_rs`
/// codec, so callers never handle `encoding_rs` types directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TextEncoding {
    /// Windows-1250 (Central/Eastern Europe).
    Windows1250,
    /// Windows-1251 (Cyrillic).
    Windows1251,
    /// Windows-1252 single-byte encoding used by TLK and many text payloads.
    Windows1252,
    /// Windows-1253 (Greek).
    Windows1253,
    /// Windows-1254 (Turkish).
    Windows1254,
    /// Windows-1255 (Hebrew).
    Windows1255,
    /// Windows-1256 (Arabic).
    Windows1256,
    /// Windows-1257 (Baltic).
    Windows1257,
    /// Windows-1258 (Vietnamese).
    Windows1258,
    /// Windows-874 (Thai).
    Windows874,
    /// Shift-JIS (Japanese, cp932-compatible family).
    ShiftJis,
    /// GBK (Simplified Chinese, cp936-compatible family).
    Gbk,
    /// Big5 (Traditional Chinese, cp950-compatible family).
    Big5,
    /// EUC-KR (Korean, cp949-compatible family).
    EucKr,
}
43
/// Error returned when strict text encoding would lose information.
///
/// Produced by [`encode_text`] and describes the first character of the input
/// that the target encoding cannot represent.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[error("character {character:?} at char index {char_index} is not encodable as {encoding:?}")]
pub struct EncodeTextError {
    /// Zero-based position of the offending character, counted in `char`s
    /// (Unicode scalar values), not in bytes.
    pub char_index: usize,
    /// Unicode character that is not representable in the target encoding.
    pub character: char,
    /// Target encoding that rejected the character.
    pub encoding: TextEncoding,
}
55
/// Error returned when strict byte decoding encounters malformed input.
///
/// Produced by [`decode_text_strict`] when the input is not valid in the
/// requested encoding.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[error("malformed byte sequence at byte index {byte_index} while decoding {encoding:?}")]
pub struct DecodeTextError {
    /// Byte offset into the input at which decoding first failed.
    pub byte_index: usize,
    /// Source encoding that rejected the byte sequence.
    pub encoding: TextEncoding,
}
65
/// Error returned when a language ID does not map to a supported text encoding.
///
/// Produced by [`text_encoding_for_language`]; the display message prints the
/// raw numeric value of the rejected ID.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[error("unsupported language id {} for text encoding", language_id.raw())]
pub struct LanguageEncodingError {
    /// Unsupported language ID.
    pub language_id: LanguageId,
}
73
74/// Decodes bytes into Unicode text.
75///
76/// For single-byte encodings such as Windows-1252, decoding is lossless across
77/// all byte values.
78pub fn decode_text(bytes: &[u8], encoding: TextEncoding) -> String {
79    let (decoded, _actual, _had_errors) = encoding_rs_codec(encoding).decode(bytes);
80    decoded.into_owned()
81}
82
83/// Decodes bytes into Unicode text using strict lossless behavior.
84///
85/// If the input contains malformed byte sequences for the target encoding,
86/// this function returns [`DecodeTextError`] instead of inserting replacements.
87pub fn decode_text_strict(bytes: &[u8], encoding: TextEncoding) -> Result<String, DecodeTextError> {
88    let codec = encoding_rs_codec(encoding);
89    let Some(decoded) = codec.decode_without_bom_handling_and_without_replacement(bytes) else {
90        let byte_index = first_malformed_byte_index(bytes, codec);
91        return Err(DecodeTextError {
92            byte_index,
93            encoding,
94        });
95    };
96    Ok(decoded.into_owned())
97}
98
99/// Encodes Unicode text into bytes using strict lossless behavior.
100///
101/// If any input character is not representable in the selected encoding, this
102/// function returns [`EncodeTextError`] instead of replacing data.
103pub fn encode_text(input: &str, encoding: TextEncoding) -> Result<Vec<u8>, EncodeTextError> {
104    let codec = encoding_rs_codec(encoding);
105    let (encoded, _actual, had_errors) = codec.encode(input);
106    if !had_errors {
107        return Ok(encoded.into_owned());
108    }
109
110    let (char_index, character) = first_unencodable(input, codec);
111    Err(EncodeTextError {
112        char_index,
113        character,
114        encoding,
115    })
116}
117
118/// Resolves the text encoding for a KotOR language ID.
119///
120/// This mapping is shared by TLK and GFF localized-string paths so behavior
121/// stays centralized and avoids cross-crate drift.
122pub fn text_encoding_for_language(
123    language_id: impl Into<LanguageId>,
124) -> Result<TextEncoding, LanguageEncodingError> {
125    let language_id = language_id.into();
126    let encoding = match language_id.raw() {
127        // Official KotOR releases.
128        0..=4 => TextEncoding::Windows1252,
129        5 => TextEncoding::Windows1250,
130        // Central/Eastern European.
131        32..=40 | 104 => TextEncoding::Windows1250,
132        // Cyrillic.
133        41..=46 => TextEncoding::Windows1251,
134        // Greek.
135        47 => TextEncoding::Windows1253,
136        // Turkish-family.
137        48..=50 => TextEncoding::Windows1254,
138        // Hebrew.
139        51 => TextEncoding::Windows1255,
140        // Arabic-family.
141        52..=54 => TextEncoding::Windows1256,
142        // Baltic.
143        55..=57 | 105 => TextEncoding::Windows1257,
144        // Vietnamese.
145        58 => TextEncoding::Windows1258,
146        // Thai.
147        59 => TextEncoding::Windows874,
148        // East Asian language families.
149        128 => TextEncoding::EucKr,
150        129 => TextEncoding::Big5,
151        130 => TextEncoding::Gbk,
152        131 => TextEncoding::ShiftJis,
153        // Optional enhancement track: add support for language IDs 70..=72
154        // (Armenian/Georgian/Tamil legacy codepages) only if downstream
155        // extended-localization use cases require it.
156        70..=72 => {
157            return Err(LanguageEncodingError { language_id });
158        }
159        // Defaults to Western European behavior used by most custom IDs.
160        _ => TextEncoding::Windows1252,
161    };
162    Ok(encoding)
163}
164
165fn encoding_rs_codec(encoding: TextEncoding) -> &'static Encoding {
166    match encoding {
167        TextEncoding::Windows1250 => WINDOWS_1250,
168        TextEncoding::Windows1251 => WINDOWS_1251,
169        TextEncoding::Windows1252 => WINDOWS_1252,
170        TextEncoding::Windows1253 => WINDOWS_1253,
171        TextEncoding::Windows1254 => WINDOWS_1254,
172        TextEncoding::Windows1255 => WINDOWS_1255,
173        TextEncoding::Windows1256 => WINDOWS_1256,
174        TextEncoding::Windows1257 => WINDOWS_1257,
175        TextEncoding::Windows1258 => WINDOWS_1258,
176        TextEncoding::Windows874 => WINDOWS_874,
177        TextEncoding::ShiftJis => SHIFT_JIS,
178        TextEncoding::Gbk => GBK,
179        TextEncoding::Big5 => BIG5,
180        TextEncoding::EucKr => EUC_KR,
181    }
182}
183
184fn first_unencodable(input: &str, codec: &'static Encoding) -> (usize, char) {
185    let mut buf = [0u8; 4];
186    for (char_index, ch) in input.chars().enumerate() {
187        let s = ch.encode_utf8(&mut buf);
188        let (_encoded, _actual, had_errors) = codec.encode(s);
189        if had_errors {
190            return (char_index, ch);
191        }
192    }
193    (0, '\u{FFFD}')
194}
195
196fn first_malformed_byte_index(bytes: &[u8], codec: &'static Encoding) -> usize {
197    let mut decoder = codec.new_decoder_without_bom_handling();
198    let mut output = String::new();
199    let mut input = bytes;
200    let mut consumed_total = 0usize;
201
202    loop {
203        let reserve = decoder
204            .max_utf8_buffer_length_without_replacement(input.len())
205            .unwrap_or(input.len().saturating_mul(4).saturating_add(16));
206        output.reserve(reserve.max(8));
207
208        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut output, true);
209        consumed_total = consumed_total.saturating_add(read);
210        input = &input[read..];
211
212        match result {
213            DecoderResult::InputEmpty => return bytes.len(),
214            DecoderResult::OutputFull => continue,
215            DecoderResult::Malformed(_, _) => return consumed_total,
216        }
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Text containing only Windows-1252 characters survives encode + decode.
    #[test]
    fn windows_1252_roundtrip_is_lossless_for_supported_text() {
        let original = "“Hello” € test";
        let bytes = encode_text(original, TextEncoding::Windows1252).expect("must encode");
        assert_eq!(decode_text(&bytes, TextEncoding::Windows1252), original);
    }

    /// A character outside the codepage is reported instead of replaced.
    #[test]
    fn strict_encoder_rejects_unencodable_characters() {
        let failure = encode_text("hello 😀", TextEncoding::Windows1252).expect_err("must fail");
        assert_eq!(failure.character, '😀');
    }

    /// Text containing only Windows-1250 characters survives encode + decode.
    #[test]
    fn windows_1250_roundtrip_is_lossless_for_supported_text() {
        let original = "Zażółć gęślą jaźń";
        let bytes = encode_text(original, TextEncoding::Windows1250).expect("must encode");
        assert_eq!(decode_text(&bytes, TextEncoding::Windows1250), original);
    }

    /// A lone Shift-JIS lead byte is malformed and must be rejected.
    #[test]
    fn strict_decoder_rejects_malformed_multibyte_sequences() {
        let truncated = [0x81];
        let failure = decode_text_strict(&truncated, TextEncoding::ShiftJis).expect_err("must fail");
        assert_eq!(failure.encoding, TextEncoding::ShiftJis);
    }

    /// Valid Shift-JIS bytes decode back to the text they were encoded from.
    #[test]
    fn strict_decoder_roundtrips_valid_multibyte_sequences() {
        let bytes = encode_text("テスト", TextEncoding::ShiftJis).expect("must encode");
        let text = decode_text_strict(&bytes, TextEncoding::ShiftJis).expect("must decode");
        assert_eq!(text, "テスト");
    }

    /// Spot-checks one ID from several encoding families.
    #[test]
    fn language_id_mapping_returns_expected_encodings() {
        let english = text_encoding_for_language(0).expect("english");
        assert_eq!(english, TextEncoding::Windows1252);

        let polish = text_encoding_for_language(5).expect("polish");
        assert_eq!(polish, TextEncoding::Windows1250);

        let thai = text_encoding_for_language(59).expect("thai");
        assert_eq!(thai, TextEncoding::Windows874);

        let japanese = text_encoding_for_language(131).expect("japanese");
        assert_eq!(japanese, TextEncoding::ShiftJis);
    }

    /// IDs in the unsupported legacy range yield an error carrying the ID.
    #[test]
    fn language_id_mapping_rejects_unsupported_legacy_codepages() {
        let failure = text_encoding_for_language(70).expect_err("must fail");
        assert_eq!(failure.language_id, LanguageId::from_raw(70));
    }
}