/* utftools.c

   Published 6 May 2016; written on or before 15 December 2008.
   Updated 28 September 2019 and February 2020.
   Source code
*/


/* Copyright 2008 by R. Harmsen.
   But I won't sue anyone who uses or adapts the code.

   28 September 2019: Added function CP1252_to_scalar()
*/

#include <string.h>

/* Usage instructions of each function are in the following
   include file */
#include "utftools.h"

/****************************************************************/
/* Classify a byte by its high bits and report how many bytes a UTF-8
   sequence starting with it would occupy (1 to 6).

   Returns 0 for a follow-up byte (10xxxxxx), which cannot start a
   sequence. Any value that matches no class (0xfe, 0xff, or anything
   outside 0..0xff) is reported as length 1.
 */
int utf8len (int firstbyte)
{
   /* Byte classes, tried in order. A value belongs to a class when
      clearing the payload bits leaves exactly the marker pattern. */
   static const struct
   {
      int payload;   /* bits that carry data in this class */
      int marker;    /* fixed high-bit pattern of the class */
      int length;    /* result reported for this class */
   } classes[] =
   {
      { 0x7f, 0x00, 1 },   /* 0xxxxxxx  plain ASCII */
      { 0x3f, 0x80, 0 },   /* 10xxxxxx  follow-up byte */
      { 0x1f, 0xc0, 2 },   /* 110xxxxx */
      { 0x0f, 0xe0, 3 },   /* 1110xxxx */
      { 0x07, 0xf0, 4 },   /* 11110xxx */
      { 0x03, 0xf8, 5 },   /* 111110xx */
      { 0x01, 0xfc, 6 },   /* 1111110x */
   };
   const int nclasses = (int)(sizeof classes / sizeof classes[0]);
   int k;

   for (k = 0; k < nclasses; k++)
   {
      if ((firstbyte & ~classes[k].payload) == classes[k].marker)
         return classes[k].length;
   }

   /* Nothing matched; treat the value as a single byte. */
   return 1;
}

/****************************************************************/
/* Strict classification of the first byte of a UTF-8 sequence.

   Returns:
      1        plain ASCII (0x00..0x7f)
      0        a follow-up byte (0x80..0xbf), not the start of a sequence
      2, 3, 4  length of the sequence this byte legally starts
     -1        the value can never start a legal UTF-8 sequence
 */
int utf8frst (int firstbyte)
{
   /* Anything that is not an 8-bit value matches no byte class. */
   if (firstbyte < 0 || firstbyte > 0xff)
      return -1;

   if (firstbyte < 0x80)
      return 1;

   if (firstbyte < 0xc0)
      return 0; /* Not the start of UTF8, but a follow-up byte */

   /* 0xc0 and 0xc1 could only start a 2-byte encoding of a 7-bit
      value. Such overlong forms are forbidden to avert spoofing. */
   if (firstbyte < 0xc2)
      return -1;

   if (firstbyte < 0xe0)
      return 2;

   if (firstbyte < 0xf0)
      return 3;

   /* See http://www.unicode.org/L2/L2000/00374r2-short-utf8.htm and
      http://unicode.org/versions/corrigendum1.html
      "Table 3.1B. Legal UTF-8 Byte Sequences":
      lead bytes above 0xf4 are not legal. */
   if (firstbyte <= 0xf4)
      return 4;

   /* 0xf5..0xff: Unicode only supports scalars of 21 bits maximum,
      encoded in 4 bytes, so longer forms are rejected. */
   return -1;
}


/****************************************************************/
/* Test whether buf points at the start of a multi-byte UTF-8 sequence
   whose follow-up bytes are all present.

   Returns 1 only for a lead byte of a 2- to 6-byte sequence followed
   by the right number of follow-up bytes (10xxxxxx). Plain ASCII and
   bare follow-up bytes return 0.
 */
int utf8valid (unsigned char *buf)
{
   int seqlen = utf8len(buf[0]);
   unsigned char *follow;
   unsigned char *stop;

   /* Only real multi-byte starts qualify; length 0 (follow-up byte)
      and length 1 (single byte) are rejected here. */
   if (seqlen < 2 || seqlen > 6)
      return 0;

   stop = buf + seqlen;
   for (follow = buf + 1; follow < stop; follow++)
   {
      /* The top two bits of every follow-up byte must be 10. */
      if ((*follow & 0xc0) != 0x80)
         return 0;
   }

   /* The byte AFTER the sequence is deliberately not inspected.
      In pure UTF8 text it could not be a follow-up byte, but this
      function must also work on mixed text that may contain both
      UTF8 and ISO-8859, so that check is intentionally left out.
    */
   return 1;
}


/****************************************************************/
/* Decode the UTF-8 sequence at start into its scalar value.

   The sequence length is taken from utf8len() of the first byte.
   If that is not a multi-byte start, or if a null byte occurs where
   a follow-up byte is expected, the first byte itself is returned
   unchanged. Follow-up bytes are only checked for being non-zero;
   their top two bits are not verified here.
 */
long utf2scalar (unsigned char *start)
{
   /* Payload bits of the lead byte, indexed by sequence length.
      Entries 0 and 1 are unused. */
   static const unsigned char leadmask[7] =
   {
      0x00, 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01
   };
   int seqlen = utf8len(*start);
   unsigned long value;
   int k;

   if (seqlen < 2 || seqlen > 6)
      return *start;

   value = (unsigned long)start[0] & leadmask[seqlen];

   for (k = 1; k < seqlen; k++)
   {
      /* A terminating null inside the sequence: give up and hand
         back the raw first byte without reading any further. */
      if (start[k] == 0)
         return *start;

      /* Each follow-up byte contributes its low six bits. */
      value = (value << 6) | ((unsigned long)start[k] & 0x3f);
   }

   return (long)value;
}


/****************************************************************/
/* Encode a scalar value as a null-terminated UTF-8 byte string.

   Values up to 31 bits are encoded, using the historical 5- and
   6-byte forms above 21 bits. No check is made for surrogates or
   for values beyond U+10FFFF.

   A negative scalar (for example the -1 "no mapping" sentinel used
   elsewhere in this file) or a value above 0x7fffffff yields an
   empty string. Note that scalar 0 also yields an empty string,
   because its encoding is the single null byte.

   The result lives in a static buffer: it is overwritten by the next
   call and must not be freed. The function is not reentrant.
 */
unsigned char *scalar2utf8 (long scalar)
{
   static unsigned char utf[7];
   unsigned long u;

   memset(utf, '\0', sizeof utf);

   /* Negative values are not scalars. Without this guard they would
      pass the "<= 0x7f" test below and be masked into a bogus ASCII
      byte (-1 would come out as 0x7f). */
   if (scalar < 0)
      return utf;

   /* Work on an unsigned copy so the shifts are well defined. */
   u = (unsigned long)scalar;

   if (u <= 0x7fUL)                /*  7 bits */
   {
      utf[0] = (unsigned char)u;
   }
   else if (u <= 0x7ffUL)          /* 11 bits */
   {
      utf[0] = (unsigned char)(0xc0 | ((u >>  6) & 0x1f));
      utf[1] = (unsigned char)(0x80 | ( u        & 0x3f));
   }
   else if (u <= 0xffffUL)         /* 16 bits */
   {
      utf[0] = (unsigned char)(0xe0 | ((u >> 12) & 0x0f));
      utf[1] = (unsigned char)(0x80 | ((u >>  6) & 0x3f));
      utf[2] = (unsigned char)(0x80 | ( u        & 0x3f));
   }
   else if (u <= 0x1fffffUL)       /* 21 bits */
   {
      utf[0] = (unsigned char)(0xf0 | ((u >> 18) & 0x07));
      utf[1] = (unsigned char)(0x80 | ((u >> 12) & 0x3f));
      utf[2] = (unsigned char)(0x80 | ((u >>  6) & 0x3f));
      utf[3] = (unsigned char)(0x80 | ( u        & 0x3f));
   }
   else if (u <= 0x3ffffffUL)      /* 26 bits */
   {
      utf[0] = (unsigned char)(0xf8 | ((u >> 24) & 0x03));
      utf[1] = (unsigned char)(0x80 | ((u >> 18) & 0x3f));
      utf[2] = (unsigned char)(0x80 | ((u >> 12) & 0x3f));
      utf[3] = (unsigned char)(0x80 | ((u >>  6) & 0x3f));
      utf[4] = (unsigned char)(0x80 | ( u        & 0x3f));
   }
   else if (u <= 0x7fffffffUL)     /* 31 bits */
   {
      utf[0] = (unsigned char)(0xfc | ((u >> 30) & 0x01));
      utf[1] = (unsigned char)(0x80 | ((u >> 24) & 0x3f));
      utf[2] = (unsigned char)(0x80 | ((u >> 18) & 0x3f));
      utf[3] = (unsigned char)(0x80 | ((u >> 12) & 0x3f));
      utf[4] = (unsigned char)(0x80 | ((u >>  6) & 0x3f));
      utf[5] = (unsigned char)(0x80 | ( u        & 0x3f));
   }
   /* Anything larger cannot be encoded; the buffer stays empty. */

   return utf;
}


/****************************************************************/
/* Tell whether a scalar value can be stored unchanged as one byte.

   Returns 1 for 0x00..0x7f and for 0xa0..0xff, otherwise 0.
   The range 0x80..0x9f is excluded. Negative values, such as the -1
   "no mapping" sentinel used elsewhere in this file, are not scalars
   and return 0.
 */
int isconvertibleISO8859_1 (long scalar)
{
   /* The lower bound matters: a bare "scalar < 0x80" would accept
      every negative value as convertible. */
   if (scalar >= 0 && scalar < 0x80)
      return 1;

   if (scalar >= 0xa0 && scalar <= 0xff)
      return 1;

   return 0;
}


/****************************************************************/
/* Tell whether a scalar value has a single-byte CP1252 representation.

   A value accepted by isconvertibleISO8859_1() maps to itself; any
   other value is looked up with convert2CP1252().

   Returns 1 when a mapping exists, 0 otherwise. On success, and only
   if p_converted is not a null pointer, the CP1252 code is stored in
   *p_converted. On failure *p_converted is left untouched.
 */
int isconvertibleCP1252 (long scalar, int *p_converted)
{
   long code;

   if (isconvertibleISO8859_1(scalar))
   {
      /* Same code point in both character sets. */
      code = scalar;
   }
   else
   {
      code = convert2CP1252(scalar);
      if (code == -1)
         return 0;
   }

   /* The output argument is optional. */
   if (p_converted)
      *p_converted = code;

   return 1;
}


/****************************************************************/
/* Scalar values of the CP1252 codes 0x80..0x9f, indexed by
   (code - 0x80). An entry of -1 marks a code with no mapping here. */
static const long CP1252_conversiontable[] =
{
   0x20ac,  /* 0x80 */
   -1,      /* 0x81 */
   0x201a,  /* 0x82 */
   0x0192,  /* 0x83 */
   0x201e,  /* 0x84 */
   0x2026,  /* 0x85 */
   0x2020,  /* 0x86 */
   0x2021,  /* 0x87 */
   0x02c6,  /* 0x88 */
   0x2030,  /* 0x89 */
   0x0160,  /* 0x8a */
   0x2039,  /* 0x8b */
   0x0152,  /* 0x8c */
   -1,      /* 0x8d */
   0x017d,  /* 0x8e */
   -1,      /* 0x8f */

   -1,      /* 0x90 */
   0x2018,  /* 0x91 */
   0x2019,  /* 0x92 */
   0x201c,  /* 0x93 */
   0x201d,  /* 0x94 */
   0x2022,  /* 0x95 */
   0x2013,  /* 0x96 */
   0x2014,  /* 0x97 */
   0x02dc,  /* 0x98 */
   0x2122,  /* 0x99 */
   0x0161,  /* 0x9a */
   0x203a,  /* 0x9b */
   0x0153,  /* 0x9c */
   -1,      /* 0x9d */
   0x017e,  /* 0x9e */
   0x0178,  /* 0x9f */
};

/* Number of entries in the table. The whole expansion is wrapped in
   parentheses so the macro stays one operand inside any expression. */
#define numelt_CP1252_conversiontable \
   (sizeof CP1252_conversiontable / \
    sizeof CP1252_conversiontable[0])


/****************************************************************/
/* Reverse lookup in CP1252_conversiontable: find the CP1252 code
   (0x80..0x9f) whose scalar value equals the argument.

   Returns the CP1252 code, or -1 if the scalar is not in the table.
   Values that are the same in ISO-8859-1 are not handled here.
 */
int convert2CP1252 (long scalar)
{
   int code = -1;
   int slot;

   /* -1 is the table's own "no mapping" marker, so it must never
      be reported as a match. */
   if (scalar != -1)
   {
      /* Plain linear scan; stops at the first matching entry. */
      for (slot = 0;
           code == -1 && slot < numelt_CP1252_conversiontable;
           slot++)
      {
         if (CP1252_conversiontable[slot] == scalar)
            code = 0x80 + slot;
      }
   }

   return code;
}

/****************************************************************/
/* Convert a CP1252 byte value to its scalar value.

   Codes accepted by isconvertibleISO8859_1() map to themselves.
   Codes 0x80..0x9f are looked up in CP1252_conversiontable.

   Returns the scalar value, or -1 when the code has no mapping or is
   outside the range covered here (negative, or above 0xff).
 */
long CP1252_to_scalar (int CP1252)
{
   const int count = (int)(sizeof CP1252_conversiontable /
                           sizeof CP1252_conversiontable[0]);
   int i;

   /* Reject negative input explicitly rather than relying on an
      implicit signed-to-unsigned conversion in the bounds check. */
   if (CP1252 < 0)
      return -1;

   if (isconvertibleISO8859_1(CP1252))
      return CP1252;

   /* Valid table indices are 0 .. count-1. The upper test must be
      ">=": index == count is already one past the last entry. */
   i = CP1252 - 0x80;
   if (i < 0 || i >= count)
      return -1;

   return CP1252_conversiontable[i];
}

/****************************************************************/

#ifdef Main
#include <stdio.h>
/* Self-test: encode every scalar from 0x80 up to (but excluding)
   100000000 with scalar2utf8(), then check that utf8valid() accepts
   the result and that utf2scalar() decodes it back to the same value.
   Every mismatch is reported on standard output. Always returns 0.
 */
int main (int argc, char *argv[])
{
   long l, after;
   unsigned char *utf;

   /* Command-line arguments are not used. */
   (void)argc;
   (void)argv;

   for (l = 0x80; l < 100000000L; l++)
   {
      utf = scalar2utf8(l);
      if (!utf8valid(utf))
      {
         /* %lx expects an unsigned long argument. */
         printf("Invalid UTF at %lx\n", (unsigned long)l);
      }
      else if ((after = utf2scalar(utf)) != l)
      {
         printf("%ld\n", l);
         printf("%ld\n", after);
         printf("%ld appears as %ld\n", l, after);
      }
   }

   return 0;
}

#endif