badvpn/misc/unicode_funcs.h
2012-01-27 13:24:37 +00:00

233 lines
7.4 KiB
C

/**
* @file unicode_funcs.h
* @author Ambroz Bizjak <ambrop7@gmail.com>
*
* @section LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the author nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BADVPN_UNICODE_FUNCS_H
#define BADVPN_UNICODE_FUNCS_H
#include <misc/expstring.h>
#include <misc/bsize.h>
#include <misc/Utf8Encoder.h>
#include <misc/Utf8Decoder.h>
#include <misc/Utf16Encoder.h>
#include <misc/Utf16Decoder.h>
/**
 * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
 *
 * The input is consumed two bytes at a time. A trailing odd byte is not
 * decoded and results in an error being reported. Decoded null characters
 * are left out of the resulting string.
 *
 * @param data UTF-16 data, in big endian
 * @param data_len size of data in bytes
 * @param out_is_error if not NULL and the function returns a string,
 *                     *out_is_error will be set to 0 or 1, indicating
 *                     whether there have been errors decoding the input.
 *                     A null decoded character is treated as an error.
 *                     An error is also reported when re-encoding the decoded
 *                     characters to UTF-16 does not reproduce the whole input.
 *                     Decoding continues after an error, so a string is
 *                     returned even when *out_is_error is set to 1.
 * @return A UTF-8 null-terminated string which can be freed with free(),
 *         or NULL if out of memory.
 */
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
/**
 * Decodes UTF-8 data into UTF-16 data as bytes.
 *
 * Each UTF-16 code unit is written as two bytes, high byte first (big endian).
 * No terminator is appended. At most out_avail bytes are written; if the
 * output buffer is too small, the output is cut off (possibly in the middle
 * of a code unit) while *out_len still reports the full size.
 *
 * @param data UTF-8 data
 * @param data_len size of data in bytes
 * @param out output buffer
 * @param out_avail number of bytes available in output buffer
 * @param out_len if not NULL, *out_len will contain the number of bytes
 *                required to store the resulting data (or overflow)
 * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
 *                     indicating whether there have been errors decoding
 *                     the input. An error is reported when re-encoding the
 *                     decoded characters to UTF-8 does not reproduce the
 *                     whole input.
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
/*
 * Converts big-endian UTF-16 bytes into a newly built null-terminated
 * UTF-8 string.
 *
 * The input is fed to the UTF-16 decoder one 16-bit unit at a time. Every
 * decoded character is re-encoded to UTF-16 and compared against the input;
 * the first mismatch, any input left unmatched at the end, or a decoded null
 * character sets the error flag. Decoding continues after an error, so a
 * string is still returned.
 *
 * data         - UTF-16 data, big endian
 * data_len     - size of data in bytes; an odd trailing byte is not decoded
 * out_is_error - if not NULL, receives 0 or 1 when a string is returned
 *
 * Returns the string obtained from ExpString_Get(), or NULL if initializing
 * or appending to the ExpString fails.
 */
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
{
    // will build the resulting UTF-8 string by appending to ExpString
    ExpString str;
    if (!ExpString_Init(&str)) {
        goto fail0;
    }
    
    // init UTF-16 decoder
    Utf16Decoder decoder;
    Utf16Decoder_Init(&decoder);
    
    // i_in is the next input byte to hand to the decoder,
    // i_ch is the number of leading input bytes already reproduced
    // by re-encoding the decoded characters
    size_t i_in = 0;
    size_t i_ch = 0;
    
    int error = 0;
    
    while (i_in < data_len) {
        // read two input bytes from the input position; a lone trailing
        // byte cannot form a 16-bit unit, so stop (it stays unmatched and
        // is reported through the final i_ch check)
        uint8_t x = data[i_in++];
        if (i_in == data_len) {
            break;
        }
        uint8_t y = data[i_in++];
        
        // combine them into a 16-bit value (big endian)
        uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
        
        // give the 16-bit value to the UTF-16 decoder and maybe
        // receive a Unicode character back
        uint32_t ch;
        if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
            continue;
        }
        
        // once an error has been seen there is no point matching further
        if (!error) {
            // encode the Unicode character back into UTF-16
            uint16_t chenc[2];
            int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
            ASSERT(chenc_n > 0)
            
            // match the result with input, byte by byte
            for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
                uint8_t cx = (chenc[chenc_i] >> 8);
                uint8_t cy = (chenc[chenc_i] & 0xFF);
                
                if (i_ch >= data_len || data[i_ch] != cx) {
                    error = 1;
                    break;
                }
                i_ch++;
                
                if (i_ch >= data_len || data[i_ch] != cy) {
                    error = 1;
                    break;
                }
                i_ch++;
            }
        }
        
        // we don't like null Unicode characters because we're building a
        // null-terminated UTF-8 string; drop the character and flag an error
        if (ch == 0) {
            error = 1;
            continue;
        }
        
        // encode the Unicode character into UTF-8; the buffer has one
        // extra byte for the terminator written below
        uint8_t enc[5];
        int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
        ASSERT(enc_n > 0)
        
        // append the resulting UTF-8 bytes to the result string.
        // enc is an array of uint8_t, while ExpString_Append is given a C
        // string; cast explicitly instead of relying on an implicit
        // uint8_t * to char * conversion, which differs in signedness.
        // NOTE(review): assumes the parameter is const char *, in line with
        // the char * returned by ExpString_Get - confirm in misc/expstring.h
        enc[enc_n] = 0;
        if (!ExpString_Append(&str, (const char *)enc)) {
            goto fail1;
        }
    }
    
    // check if we matched the whole input string when encoding back
    if (i_ch < data_len) {
        error = 1;
    }
    
    if (out_is_error) {
        *out_is_error = error;
    }
    
    // ownership of the buffer passes to the caller; str is not freed here
    return ExpString_Get(&str);
    
fail1:
    ExpString_Free(&str);
fail0:
    return NULL;
}
/*
 * Converts UTF-8 bytes into UTF-16 code units stored as bytes, high byte
 * first.
 *
 * Every decoded character is re-encoded to UTF-8 and compared against the
 * input; the first mismatch, or input left unmatched at the end, sets the
 * error flag. Conversion continues after an error.
 *
 * At most out_avail bytes are stored into out, while the reported length
 * always covers the complete result.
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
{
    Utf8Decoder dec;
    Utf8Decoder_Init(&dec);
    
    // number of leading input bytes reproduced so far by re-encoding
    size_t verified = 0;
    
    // total number of output bytes the full result needs
    bsize_t needed = bsize_fromsize(0);
    
    int bad = 0;
    
    for (size_t read_pos = 0; read_pos < data_len; read_pos++) {
        // feed one byte; move on unless a character came out
        uint32_t cp;
        if (!Utf8Decoder_Input(&dec, data[read_pos], &cp)) {
            continue;
        }
        
        // round-trip check, skipped once an error has been recorded
        if (!bad) {
            uint8_t canon[4];
            int canon_n = Utf8Encoder_EncodeCharacter(cp, canon);
            ASSERT(canon_n > 0)
            
            // advance over input bytes for as long as they agree with
            // the re-encoded form
            int k = 0;
            while (k < canon_n && verified < data_len && data[verified] == canon[k]) {
                verified++;
                k++;
            }
            
            // stopping early means a byte differed or the input ran out
            if (k < canon_n) {
                bad = 1;
            }
        }
        
        // produce the UTF-16 code units for this character
        uint16_t units[2];
        int units_n = Utf16Encoder_EncodeCharacter(cp, units);
        ASSERT(units_n > 0)
        
        // account for two bytes per code unit, whether or not they fit
        needed = bsize_add(needed, bsize_fromsize(2 * units_n));
        
        // store high byte then low byte of each unit while space remains
        for (int u = 0; u < units_n && out_avail > 0; u++) {
            *out++ = (units[u] >> 8);
            out_avail--;
            
            if (out_avail > 0) {
                *out++ = (units[u] & 0xFF);
                out_avail--;
            }
        }
    }
    
    // any input not reproduced by re-encoding counts as an error
    if (verified < data_len) {
        bad = 1;
    }
    
    if (out_len) {
        *out_len = needed;
    }
    
    if (out_is_error) {
        *out_is_error = bad;
    }
}
#endif