mirror of
https://github.com/idolpx/libMeatloaf.git
synced 2025-12-06 04:38:49 -05:00
171 lines
6.9 KiB
C++
171 lines
6.9 KiB
C++
#include "U8Char.h"
|
|
#include "punycode.h"
|
|
|
|
// from https://style64.org/petscii/
|
|
|
|
// PETSCII table in UTF8, non-mappable characters mapped to Private Use Area E000-F8FF
|
|
const char16_t U8Char::utf8map[] = {
|
|
// ---0, ---1, ---2, ---3, ---4, ---5, ---6, ---7, ---8, ---9, --10, --11, --12, --13, --14, --15
|
|
0xE000, 0xE001, 0xE002, 3, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 10, 0xE00C, 0xE00D,
|
|
0xE00E, 0xE00F, 0xE010, 0xE011, 0x8, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C,
|
|
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, // punct
|
|
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, // numbers
|
|
|
|
0x40,
|
|
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, // a-z
|
|
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
|
|
0x5b, 0xa3, 0x5d, 0x2191, 0x2190,
|
|
|
|
0x2500,
|
|
0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, // A-Z
|
|
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
|
|
0x253c, 0xE012, 0x2502, 0xE013, 0xE014,
|
|
|
|
0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, 0xE020, 0xE021, 0x2028, 0xE022, 0xE023, // control codes
|
|
0xE024, 0xE025, 0xE026, 0xE027, 0xE028, 0xE029, 0xE02A, 0xE02B, 0xE02C, 0xE02D, 0xE02E, 0xE02F, 0xE030, 0xE031, 0xE032, 0xE033,
|
|
|
|
0xa0, 0x258c, 0x2584, 0x2594, 0x2581, 0x258e, 0x2592, 0xE034, 0xE035, 0xE036, 0xE037, 0x251c, 0x2597, 0x2514, 0x2510, 0x2582, // tables etc.
|
|
0x250c, 0x2534, 0x252c, 0x2524, 0x258e, 0x258d, 0xE038, 0xE039, 0xE03A, 0x2583, 0x2713, 0x2596, 0x259d, 0x2518, 0x2598, 0x259a,
|
|
0x2500, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, // A-Z
|
|
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x253c, 0xE03B, 0x2502, 0xE03C, 0xE03D,
|
|
0xa0, 0x258c, 0x2584, 0x2594, 0x2581, 0x258e, 0x2592, 0xE03F, 0xE040, 0xE041, 0xE042, 0x251c, 0x2597, 0x2514, 0x2510, 0x2582, // tables etc.
|
|
0x250c, 0x2534, 0x252c, 0x2524, 0x258e, 0x258d, 0xE043, 0xE044, 0xE045, 0x2583, 0x2713, 0x2596, 0x259d, 0x2518, 0x2598, 0xE046
|
|
|
|
};
|
|
|
|
void U8Char::fromUtf8Stream(std::istream* reader) {
|
|
uint8_t byte = reader->get();
|
|
if(byte<=0x7f) {
|
|
ch = byte;
|
|
}
|
|
else if((byte & 0b11100000) == 0b11000000) {
|
|
uint16_t hi = ((uint16_t)(byte & 0b1111)) << 6;
|
|
uint16_t lo = (reader->get() & 0b111111);
|
|
ch = hi | lo;
|
|
}
|
|
else if((byte & 0b11110000) == 0b11100000) {
|
|
uint16_t hi = ((uint16_t)(byte & 0b111)) << 12;
|
|
uint16_t mi = ((uint16_t)(reader->get() & 0b111111)) << 6;
|
|
uint16_t lo = reader->get() & 0b111111;
|
|
ch = hi | mi | lo;
|
|
}
|
|
else {
|
|
ch = 0;
|
|
}
|
|
};
|
|
|
|
size_t U8Char::fromCharArray(char* reader) {
|
|
uint8_t byte = reader[0];
|
|
if(byte<=0x7f) {
|
|
ch = byte;
|
|
return 1;
|
|
}
|
|
else if((byte & 0b11100000) == 0b11000000) {
|
|
uint16_t hi = ((uint16_t)(byte & 0b1111)) << 6;
|
|
uint16_t lo = (reader[1] & 0b111111);
|
|
ch = hi | lo;
|
|
return 2;
|
|
}
|
|
else if((byte & 0b11110000) == 0b11100000) {
|
|
uint16_t hi = ((uint16_t)(byte & 0b111)) << 12;
|
|
uint16_t mi = ((uint16_t)(reader[1] & 0b111111)) << 6;
|
|
uint16_t lo = reader[2] & 0b111111;
|
|
ch = hi | mi | lo;
|
|
return 3;
|
|
}
|
|
else {
|
|
ch = 0;
|
|
return 1;
|
|
}
|
|
};
|
|
|
|
std::string U8Char::toUtf8() {
|
|
if(ch==0) {
|
|
return std::string(1, missing);
|
|
}
|
|
else if(ch>=0x01 && ch<=0x7f) {
|
|
// For code points in the range 0x0000 to 0x007F (1-byte sequences):
|
|
// Directly represent the code point in binary.
|
|
// Format: 0xxxxxxx
|
|
return std::string(1, char(ch));
|
|
}
|
|
else if(ch>=0x80 && ch<=0x7ff) {
|
|
// First byte: 110xxxxx, where the x's are the first 5 bits of the code point.
|
|
char upper = (uint8_t)((ch>>6) & 0b11111) | 0b11000000;
|
|
// Second byte: 10xxxxxx, where the x's are the next 6 bits of the code point.
|
|
auto lower = (uint8_t)(ch & 0b111111) | 0b10000000;
|
|
char arr[] = { (char)upper, (char)lower, '\0'};
|
|
return std::string(arr);
|
|
}
|
|
else {
|
|
// First byte: 1110xxxx, where the x's are the first 4 bits of the code point.
|
|
auto hi = (uint8_t)((ch>>12) & 0b00001111) | 0b11100000;
|
|
// Second byte: 10xxxxxx, where the x's are the next 6 bits.
|
|
auto mid = (uint8_t)((ch>>6) & 0b00111111) | 0b10000000;
|
|
// Third byte: 10xxxxxx, where the x's are the last 6 bits.
|
|
auto lower = (uint8_t)(ch & 0b00111111) | 0b10000000;
|
|
char arr[] = { (char)hi, (char)mid, (char)lower, '\0'};
|
|
return std::string(arr);
|
|
}
|
|
}
|
|
|
|
uint8_t U8Char::toPetscii() {
|
|
for(int i = 0; i<256; i++) {
|
|
if(utf8map[i]==ch)
|
|
return i;
|
|
}
|
|
return missing;
|
|
}
|
|
|
|
// for punycode we need utf8 converted to uint32_t
|
|
// workflows:
|
|
// char* ascii_punycode -> uint32_t* -> char* utf8
|
|
// char* utf8 -> uint32_t* -> char* ascii_punycode
|
|
|
|
// convert utf8 encoded string to array of uint32_t, return length of output_unicode32
|
|
size_t U8Char::toUnicode32(std::string& input_utf8, uint32_t* output_unicode32, size_t max_output_length) {
|
|
size_t input_length = input_utf8.length();
|
|
size_t output_length = 0;
|
|
size_t i = 0;
|
|
char* asChar = (char *)input_utf8.c_str();
|
|
|
|
while(i<input_length && output_length<max_output_length) {
|
|
U8Char ch(' ');
|
|
size_t skip = ch.fromCharArray(asChar+i);
|
|
output_unicode32[output_length++] = ch.ch;
|
|
i += skip;
|
|
}
|
|
return output_length;
|
|
}
|
|
|
|
// convert array of uint32_t to utf8 encoded string, return utf8 string
|
|
std::string U8Char::fromUnicode32(uint32_t* input_unicode32, size_t input_length) {
|
|
std::string output_utf8;
|
|
for(size_t i = 0; i<input_length; i++) {
|
|
U8Char ch((uint16_t)input_unicode32[i]);
|
|
output_utf8 += ch.toUtf8();
|
|
}
|
|
return output_utf8;
|
|
}
|
|
|
|
std::string U8Char::toPunycode(std::string utf8String) {
|
|
uint32_t asU32[1024];
|
|
char asPunycode[1024];
|
|
size_t dstlen = sizeof asPunycode;
|
|
size_t n_converted;
|
|
U8Char temp(' ');
|
|
|
|
size_t conv_len = temp.toUnicode32(utf8String, asU32, sizeof asU32);
|
|
n_converted = punycode_encode(asU32, conv_len, asPunycode, &dstlen);
|
|
return std::string(asPunycode, n_converted);
|
|
}
|
|
|
|
|
|
std::string U8Char::fromPunycode(std::string punycodeString) {
|
|
uint32_t asU32[1024];
|
|
size_t dstlen = sizeof asU32;
|
|
U8Char temp(' ');
|
|
|
|
punycode_decode(punycodeString.c_str(), punycodeString.length(), asU32, &dstlen);
|
|
return temp.fromUnicode32(asU32, dstlen);
|
|
} |