area51/Support/NetworkMgr/GameSpy/stringutil.c
Andrew Sampson 431f72b93a source
2021-08-27 19:22:41 -07:00

648 lines
20 KiB
C

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Conversion Utility for ASCII, UTF8 and UCS2 (Unicode) character sets
//
// See RFC2279 for reference
//
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
#include "nonport.h"
#include "stringutil.h"
#ifdef __cplusplus
extern "C" {
#endif
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Decodes one UCS2 character from the front of a UTF8 byte stream
//
//  [in]    theUTF8String  : UTF8 data to decode, doesn't need to be NULL terminated
//  [out]   theUnicodeChar : Receives the 2 byte UCS2 equivalent
//
//  return value : The number of bytes consumed from theUTF8String (1, 2 or 3)
//
//  Remarks:
//      When the leading byte does not start a well formed 1, 2 or 3 byte
//      sequence, theUnicodeChar is set to REPLACE_INVALID_CHAR and exactly one
//      byte is consumed, so the caller can resynchronise on the next byte.
//      theUTF8String[1] is read whenever the first byte is a 2 or 3 byte lead;
//      theUTF8String[2] is read only after theUTF8String[1] has been accepted
//      as a follow byte.
//
//  Security Concern:
//      Multi-byte sequences whose value bits are all zero are not rejected, so
//      such a sequence decodes to a UCS2 NULL character in the middle of a
//      stream. The original author's note states that the data is routed
//      through an ASCII stream (which strips embedded NULLs) before reaching
//      this function, which is why no check is made here.
int _ReadUCS2CharFromUTF8String(const UTF8String theUTF8String, UCS2Char* theUnicodeChar)
{
    // Bytes consumed so far; remains 0 until a sequence has been decoded
    int aBytesConsumed = 0;

#ifndef _PS2
    assert(theUnicodeChar != NULL);
#endif

    if (UTF8_IS_SINGLE_BYTE(theUTF8String[0]))
    {
        // Plain ASCII (this also covers the NULL terminator): copy the value
        *theUnicodeChar = (UCS2Char)theUTF8String[0];
        aBytesConsumed = 1;
    }
    else if (UTF8_IS_TWO_BYTE(theUTF8String[0]))
    {
        // A two byte lead is only usable when a follow byte comes next
        if (UTF8_IS_FOLLOW_BYTE(theUTF8String[1]))
        {
            // 11 bit result:
            //   lead byte   (:000ABCDE) supplies the upper 5 bits
            //   follow byte (:00FGHIJK) supplies the lower 6 bits
            //   stored as   (:0000 0ABC DEFG HIJK)
            *theUnicodeChar = (unsigned short)(((theUTF8String[0] & UTF8_TWO_BYTE_MASK) << 6) +
                                               ((theUTF8String[1] & UTF8_FOLLOW_BYTE_MASK)));
            aBytesConsumed = 2;
        }
    }
    else if (UTF8_IS_THREE_BYTE(theUTF8String[0]))
    {
        // A three byte lead needs two follow bytes; the second is only
        // inspected once the first has been validated
        if (UTF8_IS_FOLLOW_BYTE(theUTF8String[1]) &&
            UTF8_IS_FOLLOW_BYTE(theUTF8String[2]))
        {
            // 16 bit result:
            //   lead byte     (:0000ABCD) supplies the upper 4 bits
            //   first follow  (:00EFGHIJ) supplies the middle 6 bits
            //   second follow (:00KLMNOP) supplies the lower 6 bits
            //   stored as     (:ABCD EFGH IJKL MNOP)
            *theUnicodeChar = (unsigned short)(((theUTF8String[0] & UTF8_THREE_BYTE_MASK) << 12) +
                                               ((theUTF8String[1] & UTF8_FOLLOW_BYTE_MASK) << 6) +
                                               ((theUTF8String[2] & UTF8_FOLLOW_BYTE_MASK)));
            aBytesConsumed = 3;
        }
    }

    if (aBytesConsumed == 0)
    {
        // Nothing decoded: substitute the replacement character. Only one byte
        // is discarded because the next byte may begin a valid character.
        *theUnicodeChar = (UCS2Char)REPLACE_INVALID_CHAR;
        aBytesConsumed = 1;
    }
    return aBytesConsumed;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Encodes a single UCS2 (Unicode) character as UTF8
//
//  [in]    theUCS2Char   : The 2 byte character to encode
//  [out]   theUTF8String : Receives the 1-3 byte UTF8 encoding
//
//  return value : The number of bytes written to theUTF8String (1, 2 or 3)
//
//  Remarks:
//      The caller must supply room for up to 3 bytes.
//      The output is NOT NULL terminated by this function (a NULL input
//      character is simply written as a single zero byte).
int _UCS2CharToUTF8String(UCS2Char theUCS2Char, UTF8String theUTF8String)
{
#ifndef _PS2
    assert(theUTF8String != NULL);
#endif

    // Up to 7 significant bits: plain ASCII (includes the NULL terminator)
    if (theUCS2Char <= 0x7F)
    {
        theUTF8String[0] = (char)(UTF8ByteType)theUCS2Char;
        return 1;
    }

    // 8 to 11 significant bits: two byte form
    //   :00000ABC DEFGHIJK  ->  :110ABCDE 10FGHIJK
    if (theUCS2Char <= 0x07FF)
    {
        // upper 5 of the 11 bits go into the lead byte
        theUTF8String[0] = (char)(UTF8ByteType)(UTF8_TWO_BYTE_TAG | (theUCS2Char >> 6));
        // lower 6 bits go into the follow byte
        theUTF8String[1] = (char)(UTF8ByteType)(UTF8_FOLLOW_BYTE_TAG | (theUCS2Char & UTF8_FOLLOW_BYTE_MASK));
        return 2;
    }

    // 12 to 16 significant bits: three byte form
    //   :ABCDEFGH IJKLMNOP  ->  :1110ABCD 10EFGHIJ 10KLMNOP
    // upper 4 of the 16 bits go into the lead byte
    theUTF8String[0] = (char)(UTF8ByteType)(UTF8_THREE_BYTE_TAG | (theUCS2Char >> 12));
    // middle 6 bits go into the first follow byte
    theUTF8String[1] = (char)(UTF8ByteType)(UTF8_FOLLOW_BYTE_TAG | ((theUCS2Char >> 6) & UTF8_FOLLOW_BYTE_MASK));
    // lowest 6 bits go into the second follow byte
    theUTF8String[2] = (char)(UTF8ByteType)(UTF8_FOLLOW_BYTE_TAG | ((theUCS2Char) & UTF8_FOLLOW_BYTE_MASK));
    return 3;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert an ASCII string to UTF8
//
// An ASCII string already IS a valid UTF8 string, so this is a plain copy
//
//  [in]    theAsciiString, NULL terminated c-string (may be NULL)
//  [out]   theUTF8String, NULL terminated UTF8String
//
//  returns the number of bytes written to theUTF8String, terminator included
//
//  Remarks:
//      A NULL theAsciiString produces an empty string (return value 1).
//      theUTF8String must be at least as large as theAsciiString including
//      its terminator; no bounds checking is done.
int AsciiToUTF8String(const char* theAsciiString, UTF8String theUTF8String)
{
    unsigned int anIndex = 0;

    // NULL is tolerated because the SDKs allow NULL string arrays; in that
    // case nothing is copied and only the terminator below is written
    if (theAsciiString != NULL)
    {
        for (; theAsciiString[anIndex] != '\0'; anIndex++)
            theUTF8String[anIndex] = theAsciiString[anIndex];
    }

    // Terminate the output and count the terminator in the result
    theUTF8String[anIndex] = '\0';
    return (int)(anIndex + 1);
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UTF8String to its ASCII equivalent
//
//  [in]    theUTF8String, NULL terminated UTF8String (may be NULL)
//  [out]   theAsciiString, NULL terminated c-string
//
//  returns the number of bytes written to theAsciiString, terminator included
//
//  Remarks:
//      The input is processed one byte at a time: every byte that is not a
//      single byte (ASCII) UTF8 value is replaced with REPLACE_INVALID_CHAR,
//      so a multi-byte character yields one replacement per byte.
//      theAsciiString therefore needs to be as large as theUTF8String.
//      A NULL theUTF8String produces an empty string (return value 1).
int UTF8ToAsciiString(const UTF8String theUTF8String, char* theAsciiString)
{
    const unsigned char* aReadPos = (const unsigned char*)theUTF8String;
    unsigned int aLength = 0;

    // NULL is tolerated because the SDKs allow NULL string arrays
    if (theUTF8String == NULL)
    {
        theAsciiString[0] = 0x00;
        return 1;
    }

    // Walk the input byte by byte until the terminator
    for (; *aReadPos != '\0'; aReadPos++, aLength++)
    {
        if (UTF8_IS_SINGLE_BYTE(*aReadPos))
            theAsciiString[aLength] = (char)*aReadPos;
        else
            theAsciiString[aLength] = REPLACE_INVALID_CHAR;
    }

    // Terminate the output and count the terminator in the result
    theAsciiString[aLength] = '\0';
    return (int)(aLength + 1);
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UCS2 (Unicode) string to its UTF8 equivalent
//
//  [in]    theUCS2String, UCS2String terminated by a 0x0000 character (may be NULL)
//  [out]   theUTF8String, NULL terminated UTF8String
//
//  returns the number of bytes written to theUTF8String, terminator included
//
//  Remarks:
//      Each UCS2 character may expand to as many as 3 UTF8 bytes, so the
//      memory for theUTF8String may need to be up to 1.5x the byte size of
//      theUCS2String. No bounds checking is done.
//      A NULL theUCS2String produces an empty string (return value 1).
int UCS2ToUTF8String(const UCS2String theUCS2String, UTF8String theUTF8String)
{
    const UCS2Char* aReadPos   = theUCS2String;
    unsigned char*  aWritePos  = (unsigned char*)theUTF8String;
    unsigned int    aByteCount = 0;
    unsigned int    aCharBytes = 0;

    // NULL is tolerated because the SDKs allow NULL string parameters
    if (theUCS2String == NULL)
    {
        *aWritePos = 0x00;
        return 1;
    }

    // Encode one UCS2 character per iteration until the terminator
    for (; *aReadPos != 0; aReadPos++)
    {
        aCharBytes = (unsigned int)_UCS2CharToUTF8String(*aReadPos, (UTF8String)aWritePos);
        // Advance the output by however many bytes the character needed
        aWritePos  += aCharBytes;
        aByteCount += aCharBytes;
    }

    // Terminate the output and count the terminator in the result
    *aWritePos = '\0';
    return (int)(aByteCount + 1);
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UTF8 string to its UCS2 (Unicode) equivalent
//
//  [in]    theUTF8String, NULL terminated UTF8String (may be NULL)
//  [out]   theUCS2String, UCS2String terminated by a 0x0000 character
//
//  returns the number of UCS2 characters written, terminator included
//
//  Remarks:
//      Invalid UTF8 bytes are replaced with REPLACE_INVALID_CHAR, one
//      replacement per rejected byte.
//      Every input byte can produce at most one UCS2 character, so
//      theUCS2String needs room for as many UCS2 characters as theUTF8String
//      has bytes (terminator included). No bounds checking is done.
//      A NULL theUTF8String produces an empty string (return value 1).
int UTF8ToUCS2String(const UTF8String theUTF8String, UCS2String theUCS2String)
{
    const unsigned char* aReadPos   = (const unsigned char*)theUTF8String;
    UCS2Char*            aWritePos  = theUCS2String;
    int                  aCharCount = 0;

    // NULL is tolerated because the SDKs allow NULL string arrays
    if (theUTF8String == NULL)
    {
        *aWritePos = 0x0000;
        return 1;
    }

    // Each pass decodes one character into the current output slot and
    // skips the bytes that the decoder reports as consumed
    for (; *aReadPos != '\0'; aWritePos++, aCharCount++)
        aReadPos += _ReadUCS2CharFromUTF8String((UTF8String)aReadPos, aWritePos);

    // Terminate the output and count the terminator in the result
    *aWritePos = 0x0000;
    return aCharCount + 1;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Calculate the size needed to convert a UTF8String to a UCS2String
//
// [in] theUTF8String, NULL terminated UTF8String
//
// returns the length (in UCS2 characters) of theUCS2String that would be created
//
// Remarks:
// Unvalid UTF8 characters are treated as 1 byte
int _UTF8ToUCS2ConversionLengthOnly(const UTF8String theUTF8String)
{
int length = 0;
const UTF8String theReadPos = theUTF8String;
assert(theUTF8String != NULL);
if (theUTF8String == NULL)
return 0;
while (*theReadPos != '\0')
{
// Check for valid two byte string
if (UTF8_IS_TWO_BYTE(theReadPos[0]) && UTF8_IS_FOLLOW_BYTE(theReadPos[1]))
theReadPos += 2;
// Check for valid three byte string
else if (UTF8_IS_THREE_BYTE(theReadPos[0]) &&
UTF8_IS_FOLLOW_BYTE(theReadPos[1]) &&
UTF8_IS_FOLLOW_BYTE(theReadPos[2]))
{
theReadPos += 3;
}
// Anything else means one UTF8 character read from the buffer
else
theReadPos++;
// Increment the length of the UCS2 string
length++;
}
// don't count the null as a character, this conforms
// with ANSI strlen functions
return length;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Calculate the size needed to convert a UCS2String to a UTF8String
//
//  [in]    theUCS2String, UCS2String terminated by a 0x0000 character
//
//  returns the length in bytes of the UTF8String that would be created, NOT
//  counting the terminator (same convention as ANSI strlen)
//
//  Remarks:
//      Characters <= 0x7F need 1 byte, characters <= 0x07FF need 2 bytes and
//      everything else needs 3 bytes.
//      A NULL theUCS2String trips the assert; where the assert is compiled
//      out, 0 is returned (matching _UTF8ToUCS2ConversionLengthOnly).
int _UCS2ToUTF8ConversionLengthOnly(const UCS2String theUCS2String)
{
    int length = 0;
    const UCS2String theReadPos = theUCS2String;

    assert(theUCS2String != NULL);
    // The assert disappears in release builds; report an empty string
    // instead of dereferencing a NULL pointer below
    if (theUCS2String == NULL)
        return 0;

    while (*theReadPos != 0x0000)
    {
        // Values <= 0x7F are single byte ascii
        if (*theReadPos <= 0x7F)
            length++;
        // Values > 0x7F and <= 0x07FF are two bytes in UTF8
        else if (*theReadPos <= 0x07FF)
            length += 2;
        // Anything else is 3 bytes of UTF8
        else
            length += 3;

        // Step to the next UCS2 character (2 bytes further on)
        theReadPos++;
    }

    // The terminator is deliberately not counted
    return length;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UTF8String to a UCS2String, allocate space for the UCS2String
//
// [in] theUTF8String, NULL terminated UTF8String
//
// returns the newly allocated UCS2String
//
// Remarks:
// The callee is responsible for freeing the allocated memory block
UCS2String UTF8ToUCS2StringAlloc(const UTF8String theUTF8String)
{
// Allow for NULL here since SDKs allow for NULL string parameters
if (theUTF8String == NULL)
return NULL;
else
{
// Find the length of the UCS2 string and allocate a block
int newLength = _UTF8ToUCS2ConversionLengthOnly(theUTF8String);
UCS2String aUCS2String = (UCS2String)gsimalloc(sizeof(UCS2Char)*(newLength + 1));
// Do the conversion
UTF8ToUCS2String(theUTF8String, aUCS2String);
// Return the allocated string
return aUCS2String;
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UCS2String to a UTF8String, allocate space for the UTF8String
//
// [in] UCS2String, NULL terminated UCS2String
//
// returns the newly allocated UTF8String
//
// Remarks:
// The callee is responsible for freeing the allocated memory block
UTF8String UCS2ToUTF8StringAlloc(const UCS2String theUCS2String)
{
// Allow for NULL here since SDKs allow for NULL string parameters
if (theUCS2String == NULL)
return NULL;
else
{
// Find the length of the UCS2 string and allocate a block
int newLength = _UCS2ToUTF8ConversionLengthOnly(theUCS2String);
UTF8String aUTF8String = (UTF8String)gsimalloc(sizeof(char)*(newLength + 1));
// Do the conversion
UCS2ToUTF8String(theUCS2String, aUTF8String);
// Return the allocated string
return aUTF8String;
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UTF8StringArray to a UCS2StringArray, allocate space for the UCS2Strings
//
//  [in]    theUTF8StringArray, array of NULL terminated UTF8Strings (may be NULL)
//  [in]    theNumStrings, how many strings are in the array
//
//  returns the newly allocated UCS2StringArray, or NULL when the input array
//  is NULL, theNumStrings is not positive, or the array allocation fails
//
//  Remarks:
//      Entry i of the result is UTF8ToUCS2StringAlloc(theUTF8StringArray[i]),
//      so an entry is NULL when the matching source string is NULL.
//      The caller is responsible for freeing each string and then the array.
UCS2String* UTF8ToUCS2StringArrayAlloc(const UTF8String* theUTF8StringArray, int theNumStrings)
{
    UCS2String* aUCS2StringArray = NULL;
    int stringNum = 0;

    // Allow for NULL here since SDKs allow for NULL string arrays. A
    // negative count is rejected too, so it never reaches the size
    // computation below.
    if (theUTF8StringArray == NULL || theNumStrings <= 0)
        return NULL;

    aUCS2StringArray = (UCS2String*)gsimalloc(sizeof(UCS2String)*theNumStrings);

    // NOTE(review): assumes gsimalloc reports failure by returning NULL,
    // as malloc does -- confirm against the allocator in nonport.
    if (aUCS2StringArray == NULL)
        return NULL;

    // The index is advanced in the loop header rather than inside the
    // assignment: "a[i++] = f(b[i])" both modifies and reads i without
    // sequencing, which is undefined behaviour in C.
    for (stringNum = 0; stringNum < theNumStrings; stringNum++)
        aUCS2StringArray[stringNum] = UTF8ToUCS2StringAlloc(theUTF8StringArray[stringNum]);

    return aUCS2StringArray;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UCS2StringArray to a UTF8StringArray, allocate space for the UTF8Strings
//
//  [in]    theUCS2StringArray, array of 0x0000 terminated UCS2Strings (may be NULL)
//  [in]    theNumStrings, how many strings are in the array
//
//  returns the newly allocated UTF8StringArray, or NULL when the input array
//  is NULL, theNumStrings is not positive, or the array allocation fails
//
//  Remarks:
//      Entry i of the result is UCS2ToUTF8StringAlloc(theUCS2StringArray[i]),
//      so an entry is NULL when the matching source string is NULL.
//      The caller is responsible for freeing each string and then the array.
UTF8String* UCS2ToUTF8StringArrayAlloc(const UCS2String* theUCS2StringArray, int theNumStrings)
{
    UTF8String* aUTF8StringArray = NULL;
    int stringNum = 0;

    // Allow for NULL here since SDKs allow for NULL string arrays. A
    // negative count is rejected too, so it never reaches the size
    // computation below.
    if (theUCS2StringArray == NULL || theNumStrings <= 0)
        return NULL;

    aUTF8StringArray = (UTF8String*)gsimalloc(sizeof(UTF8String)*theNumStrings);

    // NOTE(review): assumes gsimalloc reports failure by returning NULL,
    // as malloc does -- confirm against the allocator in nonport.
    if (aUTF8StringArray == NULL)
        return NULL;

    // The index is advanced in the loop header rather than inside the
    // assignment: "a[i++] = f(b[i])" both modifies and reads i without
    // sequencing, which is undefined behaviour in C.
    for (stringNum = 0; stringNum < theNumStrings; stringNum++)
        aUTF8StringArray[stringNum] = UCS2ToUTF8StringAlloc(theUCS2StringArray[stringNum]);

    return aUTF8StringArray;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UCS2String to an AsciiString
//
// [in] UCS2StringArray, NULL terminated UCS2String
// [in/out] theAsciiString, ascii representation
//
// returns the length of the Ascii string
//
// Remarks:
// callee is responsible for allocating memory for theAsciiString
// Invalid ASCII characters are truncated
// The ASCII buffer must be at least 1/2 the size of the UCS2String
int UCS2ToAsciiString(const UCS2String theUCS2String, char* theAsciiString)
{
int length = 0;
const UCS2String aReadPos = theUCS2String;
char* aWritePos = theAsciiString;
assert(theAsciiString != NULL);
// Allow for NULL here since SDKs allow for NULL string arrays
if (theUCS2String == NULL)
{
*theAsciiString = '\0';
return 1;
}
// Convert each character until a '\0' is reached
while(*aReadPos != '\0')
{
(*aWritePos++) = (char)(0x00FF & (*aReadPos++));
length++;
}
// append the NULL
*aWritePos = '\0';
length++;
return length;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert an ASCII string to a UCS2String
//
//  [in]        theAsciiString, NULL terminated ASCII string (may be NULL)
//  [in/out]    theUCS2String, UCS2String to be filled with the converted ASCII
//
//  returns the number of UCS2 characters written, terminator included
//
//  Remarks:
//      The caller is responsible for allocating memory for theUCS2String;
//      it needs one UCS2 character per input byte, i.e. 2x the byte size of
//      theAsciiString.
//      Each input char is masked with 0x00FF so that only its low 8 bits
//      reach the UCS2 character.
//      A NULL theAsciiString produces an empty string (return value 1).
int AsciiToUCS2String(const char* theAsciiString, UCS2String theUCS2String)
{
    int anIndex = 0;

    assert(theUCS2String != NULL);

    // NULL is tolerated because the SDKs allow NULL string arrays; in that
    // case nothing is copied and only the terminator below is written
    if (theAsciiString != NULL)
    {
        // Widen each char, stripping anything above the low byte
        for (; theAsciiString[anIndex] != '\0'; anIndex++)
            theUCS2String[anIndex] = (unsigned short)(0x00FF & theAsciiString[anIndex]);
    }

    // Terminate the output and count the terminator in the result
    theUCS2String[anIndex] = '\0';
    return anIndex + 1;
}
/*
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UCS2String to a UTF8String with a maximum length
//
// [in] theUCS2String, NULL terminated UCS2String
// [in/out] theUTF8String, The UTF8 equivilent of theUCS2String
// [in] theMaxLength, maximum number of UTF8 characters to write
//
// returns the length of the UTF8String
//
// Remarks:
// The length of theUTF8String will not exceed theMaxLength supplied.
int UCS2ToUTF8StringLength(const UCS2String theUCS2String, UTF8String theUTF8String, int theMaxLength)
{
return 0;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Convert a UTF8String to a UCS2String with a maximum length
//
// [in] theUTF8String, NULL terminated UTF8String
// [in/out] theUCS2String, The UCS2 equivilent of theUTF8String
// [in] theMaxLength, maximum number of UTF8 characters to write
//
// returns the length of the UCS2String
//
// Remarks:
// The length of theUCS2String will not exceed theMaxLength supplied.
int UTF8ToUCS2StringLength(const UTF8String theUTF8String, UCS2String theUCS2String, int theMaxLength)
{
return 0;
}
*/
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
#ifdef __cplusplus
} //extern "C"
#endif