utftools.c
Publication, written 15 December 2008 or before. Updated 28 September 2019 and February 2020.
Sources belong to this explanation.
Source code
/* Copyright 2008 by R. Harmsen.
But I won't sue anyone who uses or adapts the code.
28 September 2019: Added function CP1252_to_scalar()
*/
#include <string.h>
/* Usage instructions of each function are in the following
include file */
#include "utftools.h"
/****************************************************************/
/* Classify a byte as the potential first byte of a UTF-8 sequence and
   return the sequence length that its bit pattern announces.

   Returns:
     1     for 0xxxxxxx (7-bit ASCII),
     0     for 10xxxxxx (a follow-up byte, not a sequence start),
     2..6  for the lead-byte patterns 110xxxxx up to 1111110x,
     1     for everything that matches none of the patterns above
           (0xfe, 0xff, and any value outside 0..0xff).

   No validity checking beyond the bit pattern is done here. */
int utf8len (int firstbyte)
{
    /* One entry per byte class: all bits outside `payload` must be
       equal to `marker`.  Entries are tried in order. */
    static const struct
    {
        int payload;   /* low bits that carry data and are ignored */
        int marker;    /* required value of the remaining bits */
        int length;    /* result for this class */
    } classes[] =
    {
        { 0x7f, 0x00, 1 },   /* 0xxxxxxx */
        { 0x3f, 0x80, 0 },   /* 10xxxxxx */
        { 0x1f, 0xc0, 2 },   /* 110xxxxx */
        { 0x0f, 0xe0, 3 },   /* 1110xxxx */
        { 0x07, 0xf0, 4 },   /* 11110xxx */
        { 0x03, 0xf8, 5 },   /* 111110xx */
        { 0x01, 0xfc, 6 },   /* 1111110x */
    };
    unsigned k;

    for (k = 0; k < sizeof classes / sizeof classes[0]; k++)
    {
        if ((firstbyte & ~classes[k].payload) == classes[k].marker)
            return classes[k].length;
    }

    /* Nothing matched: deemed to be a single byte as well. */
    return 1;
}
/****************************************************************/
/* Strict variant of the first-byte classification: in addition to the
   bit pattern, lead bytes that can never start a legal UTF-8 sequence
   are rejected.

   Returns:
     1   for 7-bit ASCII (0x00..0x7f),
     0   for a follow-up byte (0x80..0xbf),
     2   for 0xc2..0xdf,
     3   for 0xe0..0xef,
     4   for 0xf0..0xf4,
    -1   for everything else.

   See http://www.unicode.org/L2/L2000/00374r2-short-utf8.htm and
   http://unicode.org/versions/corrigendum1.html
   "Table 3.1B. Legal UTF-8 Byte Sequences". */
int utf8frst (int firstbyte)
{
    /* Anything that is not an 8-bit byte value is never acceptable. */
    if (firstbyte < 0x00 || firstbyte > 0xff)
        return -1;

    if (firstbyte <= 0x7f)
        return 1;

    if (firstbyte <= 0xbf)
        return 0;   /* Not the start of UTF8, but a follow-up byte */

    if (firstbyte <= 0xdf)
    {
        /* 0xc0 and 0xc1 would encode a 7-bit ASCII value in two bytes.
           That is forbidden to avert spoofing, so the payload bits
           above the lowest one must not all be zero. */
        return (firstbyte >= 0xc2) ? 2 : -1;
    }

    if (firstbyte <= 0xef)
        return 3;

    /* Unicode only supports scalars of 21 bits maximum, encoded in at
       most 4 bytes, and the highest legal 4-byte lead is 0xf4.
       Higher lead bytes, the 5- and 6-byte forms and 0xfe/0xff are
       therefore all wrong. */
    if (firstbyte <= 0xf4)
        return 4;

    return -1;
}
/****************************************************************/
/* Test whether buf points at the start of a multi-byte UTF-8 sequence
   whose follow-up bytes are all present.

   Returns 1 if buf[0] announces a length of 2..6 (per utf8len) and
   each of the announced follow-up bytes has the form 10xxxxxx.
   Returns 0 otherwise.  Note that plain ASCII and a lone follow-up
   byte are therefore reported as NOT valid: only a true multi-byte
   sequence yields 1.

   The byte AFTER the sequence is deliberately not inspected.  In pure
   UTF-8 text it could not be a follow-up byte, but this function is
   also meant for mixed text that may contain both UTF-8 and ISO-8859.

   Scanning stops at the first byte that is not a follow-up byte, so a
   terminating null byte inside the announced length ends the check. */
int utf8valid (unsigned char *buf)
{
    const unsigned char *follow;
    int remaining;
    int total = utf8len(buf[0]);

    if (total < 2 || total > 6)
        return 0;

    follow = buf + 1;
    for (remaining = total - 1; remaining > 0; remaining--, follow++)
    {
        /* The two top bits of a follow-up byte must be 1 and 0. */
        if ((*follow & 0xc0) != 0x80)
            return 0;
    }

    /* No invalidity detected, so we assume a valid UTF8 sequence. */
    return 1;
}
/****************************************************************/
/* Decode the UTF-8 sequence at start into its scalar value.

   The length is taken from the first byte (see utf8len).  If that
   length is not 2..6, or if any of the announced follow-up bytes is a
   null byte (the string ends prematurely), the first byte itself is
   returned unchanged.

   Follow-up bytes are only tested for being non-zero; whether they
   really have the 10xxxxxx form is not checked here.  Bytes past the
   first null byte are never read. */
long utf2scalar (unsigned char *start)
{
    /* Payload bits of the lead byte, indexed by sequence length.
       For length 6 the lead byte is 0xfc or 0xfd, so of the two bits
       kept by the mask only the lowest can actually be set. */
    static const unsigned char leadmask[7] =
        { 0x00, 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x03 };
    unsigned long value;
    int length = utf8len(*start);
    int k;

    if (length < 2 || length > 6)
        return *start;

    value = (unsigned long)start[0] & leadmask[length];
    for (k = 1; k < length; k++)
    {
        if (start[k] == 0)
            return *start;   /* truncated sequence */

        /* Each follow-up byte contributes six more bits. */
        value = (value << 6) | ((unsigned long)start[k] & 0x3f);
    }
    return (long)value;
}
/****************************************************************/
/* Encode a scalar value as a null-terminated UTF-8 byte sequence.

   Values up to 31 bits are supported, using the original 1- to 6-byte
   UTF-8 scheme.  No check is made that the value is a legal Unicode
   scalar.

   Returns a pointer to a static buffer that is overwritten by every
   call, so the result must be used or copied before the next call.

   A negative scalar, or one larger than 0x7fffffff, has no encoding:
   the returned buffer then holds the empty string.  (Scalar 0 also
   yields an empty string, because its encoding is the null byte.) */
unsigned char *scalar2utf8 (long scalar)
{
    static unsigned char utf[7];

    /* Clearing the whole buffer also provides the terminating null. */
    memset(utf, '\0', sizeof utf);

    if (scalar < 0)
    {
        /* Not encodable.  Without this test a negative value (for
           example the -1 "no mapping" marker) would be emitted as a
           bogus ASCII byte. */
    }
    else if (scalar <= 0x7f)              /*  7 bits */
    {
        utf[0] = scalar & 0x7f;
    }
    else if (scalar <= 0x7ffL)            /* 11 bits */
    {
        utf[0] = 0xc0 | ((scalar >>  6) & 0x1f);
        utf[1] = 0x80 | ( scalar        & 0x3f);
    }
    else if (scalar <= 0xffffL)           /* 16 bits */
    {
        utf[0] = 0xe0 | ((scalar >> 12) & 0x0f);
        utf[1] = 0x80 | ((scalar >>  6) & 0x3f);
        utf[2] = 0x80 | ( scalar        & 0x3f);
    }
    else if (scalar <= 0x1fffffL)         /* 21 bits */
    {
        utf[0] = 0xf0 | ((scalar >> 18) & 0x07);
        utf[1] = 0x80 | ((scalar >> 12) & 0x3f);
        utf[2] = 0x80 | ((scalar >>  6) & 0x3f);
        utf[3] = 0x80 | ( scalar        & 0x3f);
    }
    else if (scalar <= 0x3ffffffL)        /* 26 bits */
    {
        utf[0] = 0xf8 | ((scalar >> 24) & 0x03);
        utf[1] = 0x80 | ((scalar >> 18) & 0x3f);
        utf[2] = 0x80 | ((scalar >> 12) & 0x3f);
        utf[3] = 0x80 | ((scalar >>  6) & 0x3f);
        utf[4] = 0x80 | ( scalar        & 0x3f);
    }
    else if (scalar <= 0x7fffffffL)       /* 31 bits */
    {
        utf[0] = 0xfc | ((scalar >> 30) & 0x01);
        utf[1] = 0x80 | ((scalar >> 24) & 0x3f);
        utf[2] = 0x80 | ((scalar >> 18) & 0x3f);
        utf[3] = 0x80 | ((scalar >> 12) & 0x3f);
        utf[4] = 0x80 | ((scalar >>  6) & 0x3f);
        utf[5] = 0x80 | ( scalar        & 0x3f);
    }
    return utf;
}
/****************************************************************/
/* Tell whether a scalar value can be stored unchanged in a single
   byte: the value is in 0x00..0x7f or in 0xa0..0xff.

   The range 0x80..0x9f is excluded, and so is every negative value.
   In particular the -1 "no mapping" marker used elsewhere in this
   file is never reported as convertible.

   Returns 1 if convertible, 0 if not. */
int isconvertibleISO8859_1 (long scalar)
{
    if (scalar >= 0 && scalar < 0x80)
        return 1;
    if (scalar >= 0xa0 && scalar <= 0xff)
        return 1;
    return 0;
}
/****************************************************************/
/* Tell whether a scalar value can be represented as a single byte,
   either directly (see isconvertibleISO8859_1) or through the lookup
   done by convert2CP1252.

   scalar       the value to test
   p_converted  optional; when this is not a null pointer and the
                scalar is convertible, the resulting byte value is
                stored through it.  It is left untouched when the
                scalar is not convertible.

   Returns 1 if convertible, 0 if not. */
int isconvertibleCP1252 (long scalar, int *p_converted)
{
    long byteval;

    if (isconvertibleISO8859_1(scalar))
    {
        /* Directly convertible: the byte value equals the scalar. */
        byteval = scalar;
    }
    else
    {
        byteval = convert2CP1252(scalar);
        if (byteval == -1)
            return 0;
    }

    if (p_converted)
        *p_converted = byteval;
    return 1;
}
/****************************************************************/
/* Scalar values for the byte values 0x80..0x9f: the entry at index i
   belongs to byte value 0x80 + i.  An entry of -1 marks a byte value
   for which the table holds no scalar.  The table is only ever read,
   hence const. */
static const long CP1252_conversiontable[] =
{
    0x20ac, /* 0x80 */
    -1,     /* 0x81 */
    0x201a, /* 0x82 */
    0x0192, /* 0x83 */
    0x201e, /* 0x84 */
    0x2026, /* 0x85 */
    0x2020, /* 0x86 */
    0x2021, /* 0x87 */
    0x02c6, /* 0x88 */
    0x2030, /* 0x89 */
    0x0160, /* 0x8a */
    0x2039, /* 0x8b */
    0x0152, /* 0x8c */
    -1,     /* 0x8d */
    0x017d, /* 0x8e */
    -1,     /* 0x8f */
    -1,     /* 0x90 */
    0x2018, /* 0x91 */
    0x2019, /* 0x92 */
    0x201c, /* 0x93 */
    0x201d, /* 0x94 */
    0x2022, /* 0x95 */
    0x2013, /* 0x96 */
    0x2014, /* 0x97 */
    0x02dc, /* 0x98 */
    0x2122, /* 0x99 */
    0x0161, /* 0x9a */
    0x203a, /* 0x9b */
    0x0153, /* 0x9c */
    -1,     /* 0x9d */
    0x017e, /* 0x9e */
    0x0178, /* 0x9f */
};
/* Number of entries in the table.  The expansion is fully
   parenthesized so that it can be used safely inside any larger
   expression. */
#define numelt_CP1252_conversiontable \
    (sizeof CP1252_conversiontable / sizeof CP1252_conversiontable[0])
/****************************************************************/
/* Look up a scalar value in CP1252_conversiontable and return the
   corresponding byte value (0x80 plus the index of the first matching
   entry), or -1 if the scalar does not occur in the table.

   Only the table is consulted here; values that can be stored in a
   byte unchanged are not handled by this function.

   A scalar of -1 is rejected up front, because -1 is the value that
   marks unused table entries and would otherwise be "found". */
int convert2CP1252 (long scalar)
{
    const long *entry = CP1252_conversiontable;
    const long *limit = CP1252_conversiontable
                        + (numelt_CP1252_conversiontable);

    if (scalar == -1)
        return -1;

    /* Plain linear scan: the table is small. */
    while (entry != limit)
    {
        if (*entry == scalar)
            return (int)(entry - CP1252_conversiontable) + 0x80;
        entry++;
    }
    return -1;
}
/****************************************************************/
/* Convert a single byte value to its scalar value.

   Byte values accepted by isconvertibleISO8859_1 map to themselves.
   The remaining byte values are looked up in CP1252_conversiontable.

   Returns the scalar, or -1 if there is none: the input is negative,
   lies beyond the table, or hits a table entry that is itself -1. */
long CP1252_to_scalar (int CP1252)
{
    int i;

    /* A negative value is not a byte value at all. */
    if (CP1252 < 0)
        return -1;

    if (isconvertibleISO8859_1(CP1252))
        return CP1252;

    /* Explicit check of both bounds; an index equal to the number of
       entries is already one past the end of the table. */
    i = CP1252 - 0x80;
    if (i < 0 ||
        i >= (int)(sizeof CP1252_conversiontable /
                   sizeof CP1252_conversiontable[0]))
        return -1;

    return CP1252_conversiontable[i];
}
/****************************************************************/
#ifdef Main
#include <stdio.h>
/* Self-test, only compiled when Main is defined.

   Every scalar from 0x80 up to (but not including) 100000000 is
   encoded with scalar2utf8, checked with utf8valid and decoded again
   with utf2scalar.  A line is printed for each value whose encoding
   is reported as invalid or that does not survive the round trip; no
   output means no problems were found.

   The loop starts at 0x80 because utf8valid reports single-byte
   (ASCII) sequences as not valid by design.

   Always returns 0. */
int main (void)
{
    long l, after;
    unsigned char *utf;

    for (l = 0x80; l < 100000000L; l++)
    {
        utf = scalar2utf8(l);
        if (!utf8valid(utf))
        {
            /* %lx expects an unsigned long argument. */
            printf("Invalid UTF at %lx\n", (unsigned long)l);
        }
        else if ((after = utf2scalar(utf)) != l)
        {
            printf("%ld\n", l);
            printf("%ld\n", after);
            printf("%ld appears as %ld\n", l, after);
        }
    }
    return 0;
}
#endif