Logo Search packages:      
Sourcecode: aegis version File versions

wstr.c

/*
 *    aegis - project change supervisor
 *    Copyright (C) 1995, 1996, 1998, 1999, 2002 Peter Miller;
 *    All rights reserved.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
 *
 *
 * MANIFEST: wide string manipulation functions
 *
 * A literal pool is maintained.  Each string has a reference count.  The
 * string stays in the literal pool for as long as it has a positive
 * reference count.  To determine if a string is already in the literal pool,
 * linear dynamic hashing is used to guarantee an O(1) search.  Making all equal
 * strings the same item in the literal pool means that string equality is
 * a pointer test, and thus very fast.
 */

#include <ac/stddef.h>
#include <ac/stdio.h>
#include <ac/stdlib.h>
#include <ac/string.h>
#include <ac/stdarg.h>
#include <ac/wchar.h>
#include <ac/wctype.h>

#include <error.h>
#include <language.h>
#include <mem.h>
#include <str.h>
#include <wstr.h>


static wstring_ty **hash_table;
static wstr_hash_ty hash_modulus;
static wstr_hash_ty hash_cutover;
static wstr_hash_ty hash_cutover_mask;
static wstr_hash_ty hash_cutover_split_mask;
static wstr_hash_ty hash_split;
static wstr_hash_ty hash_load;
static int      changed;

#define MAX_HASH_LEN 20


/*
 * NAME
 *    hash_generate - hash string to number
 *
 * SYNOPSIS
 *    wstr_hash_ty hash_generate(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *    The hash_generate function is used to make a number from a string.
 *
 * RETURNS
 *    wstr_hash_ty - the magic number
 *
 * CAVEAT
 *    Only the last MAX_HASH_LEN characters are used.
 *    It is important that wstr_hash_ty be unsigned (int or long).
 */

static wstr_hash_ty
hash_generate(const wchar_t *s, size_t n)
{
    wstr_hash_ty    retval;

    if (n > MAX_HASH_LEN)
    {
      s += n - MAX_HASH_LEN;
      n = MAX_HASH_LEN;
    }

    retval = 0;
    while (n > 0)
    {
      retval = (retval + (retval << 1)) ^ *s++;
      --n;
    }
    return retval;
}


/*
 * NAME
 *    wstr_initialize - start up string table
 *
 * SYNOPSIS
 *    void wstr_initialize(void);
 *
 * DESCRIPTION
 *    The wstr_initialize function is used to create the hash table and
 *    initialize it to empty.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    This function must be called before any other defined in this file.
 */

static void
wstr_initialize(void)
{
    wstr_hash_ty    j;

    if (hash_modulus)
      return;
    hash_modulus = 1 << 8;    /* MUST be a power of 2 */
    hash_cutover = hash_modulus;
    hash_split = hash_modulus - hash_cutover;
    hash_cutover_mask = hash_cutover - 1;
    hash_cutover_split_mask = (hash_cutover * 2) - 1;
    hash_load = 0;
    hash_table = (wstring_ty **)mem_alloc(hash_modulus * sizeof(wstring_ty *));
    for (j = 0; j < hash_modulus; ++j)
      hash_table[j] = 0;
}


/*
 * NAME
 *    split - reduce table loading
 *
 * SYNOPSIS
 *    void split(void);
 *
 * DESCRIPTION
 *    The split function is used to reduce the load factor on the hash table.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    A load factor of about 80% is suggested.
 */

static void
split(void)
{
    wstring_ty      *p;
    wstring_ty      *p2;
    wstr_hash_ty    idx;

    /*
     * get the list to be split across buckets
     */
    p = hash_table[hash_split];
    hash_table[hash_split] = 0;

    /*
     * increase the modulus by one
     */
    hash_modulus++;
    hash_table =
      mem_change_size(hash_table, hash_modulus * sizeof(wstring_ty *));
    hash_table[hash_modulus - 1] = 0;
    hash_split = hash_modulus - hash_cutover;
    if (hash_split >= hash_cutover)
    {
      hash_cutover = hash_modulus;
      hash_split = 0;
      hash_cutover_mask = hash_cutover - 1;
      hash_cutover_split_mask = (hash_cutover * 2) - 1;
    }

    /*
     * now redistribute the list elements
     */
    while (p)
    {
      p2 = p;
      p = p->wstr_next;

      idx = p2->wstr_hash & hash_cutover_mask;
      if (idx < hash_split)
          idx = p2->wstr_hash & hash_cutover_split_mask;
      p2->wstr_next = hash_table[idx];
      hash_table[idx] = p2;
    }
}


/*
 * NAME
 *    wstr_from_c - make string from C string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_from_c(char *);
 *
 * DESCRIPTION
 *    The wstr_from_c function is used to make a string from a NUL
 *    terminated C string.  The conversion from multi-byte to wide
 *    characters is done in the current locale.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_c(const char *s)
{
    return wstr_n_from_c(s, strlen(s));
}


/*
 * NAME
 *    wstr_from_wc - make string from a wide C string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_from_wc(wchar_t *);
 *
 * DESCRIPTION
 *    The wstr_from_c function is used to make a string from a NUL
 *    terminated wide C string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_wc(const wchar_t *ws)
{
    return wstr_n_from_wc(ws, wcslen(ws));
}


/*
 * NAME
 *    wstr_n_from_c - make string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_n_from_c(char *s, size_t n);
 *
 * DESCRIPTION
 *    The wstr_n_from_c function is used to make a string from an
 *    array of characters.  No NUL terminator is assumed.  The
 *    conversion from muti-byte to wide characters is done in the
 *    current locale.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_c(const char *s, size_t length)
{
#if __STDC__ >= 1
    static char     escapes[] = "\aa\bb\ff\nn\rr\tt\vv";
#else
    static char     escapes[] = "\bb\ff\nn\rr\tt";
#endif
    static wchar_t  *buf;
    static size_t   bufmax;
    size_t          remainder;
    const char      *ip;
    wchar_t         *op;

    /*
     * Do the conversion "long hand".  This is because some
     * implementations of the mbstowcs function barf when they see
     * invalid multi byte character sequences.  This function
     * renders them as C escape sequences and keeps going.
     */
    if (bufmax < length)
    {
      bufmax = length;
      /* the 4 is the longest escape sequence */
      buf = mem_change_size(buf, bufmax * sizeof(wchar_t) * 4);
    }

    /*
     * change the locale to the native language default
     */
    language_human();

    /*
     * Reset the mbtowc internal state.
     */
    mbtowc((wchar_t *)0, (char *)0, 0);                      /*lint !e418 */

    /*
     * scan the string and extract the wide characters
     */
    ip = s;
    op = buf;
    remainder = length;
    while (remainder > 0)
    {
      int             n;

      n = mbtowc(op, ip, remainder);
      if (n == 0)
          break;
      if (n < 0)
      {
          char           *esc;

          /*
           * Invalid multi byte sequence, replace the
           * first character with a C escape sequence.
           */
          esc = strchr(escapes, *ip);
          if (esc)
          {
            *op++ = '\\';
            *op++ = esc[1];
          }
          else
          {
            *op++ = '\\';
            *op++ = '0' + ((*ip >> 6) & 7);
            *op++ = '0' + ((*ip >> 3) & 7);
            *op++ = '0' + (*ip & 7);
          }
          ++ip;
          --remainder;

          /*
           * The mbtowc function's internal state will now
           * be "error" or broken, or otherwise useless.
           * Reset it so that we can keep going.
           */
          mbtowc((wchar_t *)0, (char *)0, 0);   /*lint !e418 */
      }
      else
      {
          /*
           * the one wchar_t used n chars
           */
          ip += n;
          remainder -= n;
          ++op;
      }
    }

    /*
     * change the locale back to the C locale
     */
    language_C();

    /*
     * build the result from the image in ``buf''
     */
    return wstr_n_from_wc(buf, op - buf);
}


/*
 * NAME
 *    wstr_to_mbs - wide string to multi-byte C string
 *
 * SYNOPSIS
 *    void wstr_to_mbs(wstring_ty *s, char **rslt, size_t *rslt_len);
 *
 * DESCRIPTION
 *    The wstr_to_mbs function convers a wide character string into a
 *    multi-byte C string.  The conversion is done in the current
 *    locale.  The result is NUL terminated, however the result length
 *    does not include the NUL.
 *
 * CAVEAT
 *    DO NOT free the result.  The result will change between calls,
 *    so copy it if you need to keep it.
 */

void
wstr_to_mbs(const wstring_ty *s, char **result_p, size_t *result_length_p)
{
    static char     *buf;
    static size_t   bufmax;
    int             n;
    const wchar_t   *ip;
    size_t          remainder;
    char            *op;
    size_t          buflen;

    /*
     * For reasons I don't understand, the MB_CUR_MAX symbol (wich is
     * defined as a reference to __mb_cur_max) does not get resolved
     * at link time, despite being present in libc.  It's easier to
     * just dodge the question.
     */
#ifdef __CYGWIN__
#undef MB_CUR_MAX
#define MB_CUR_MAX 8
#endif

    /*
     * Do the conversion "long hand".  This is because the wcstombs
     * function barfs when it sees an invalid wchar_t.  This
     * function treats them literally and keeps going.
     */
    buflen = (s->wstr_length + 1) * MB_CUR_MAX;
    if (buflen < s->wstr_length + 1)
    {
      /*
       * There are some wonderfully brain-dead implementatiosn
       * out there.  The more stupid ones manage to make
       * MB_CUR_MAX be zero!
       */
      buflen = s->wstr_length + 1;
    }
    if (buflen > bufmax)
    {
      bufmax = buflen;
      buf = mem_change_size(buf, bufmax);
    }

    /*
     * perform the conversion in the native language default
     */
    language_human();

    /*
     * The wctomb function has internal state.  It needs to be reset.
     */
    wctomb((char *)0, 0);                              /*lint !e418 */

    ip = s->wstr_text;
    remainder = s->wstr_length;
    op = buf;
    while (remainder > 0)
    {
      n = wctomb(op, *ip);
      if (n <= 0)
      {
          /*
           * Copy the character literally.
           * Throw away anything that will not fit.
           */
          *op++ = *ip++;
          if (!op[-1])
            op[-1] = '?';
          --remainder;

          /*
           * The wctomb function's internal state will now
           * be "error" or broken, or otherwise useless.
           * Reset it so that we can keep going.
           */
          wctomb((char *)0, 0);                        /*lint !e418 */
      }
      else
      {
          op += n;
          ++ip;
          --remainder;
      }
    }

    /*
     * The final NUL could require shift state end characters,
     * meaning that n could be more than 1.
     */
    n = wctomb(op, (wchar_t)0);
    if (n <= 0)
      *op = 0;
    else
    {
      op += n - 1;
      assert(*op == 0);
    }

    /*
     * restore the locale to the C locale
     */
    language_C();

    /*
     * set the output side effects
     */
    *result_p = buf;
    *result_length_p = op - buf;
}


/*
 * NAME
 *    wstr_n_from_wc - make string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *    The wstr_n_from_c function is used to make a string from an
 *    array of wide characters.  No NUL terminator is assumed.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.  Use
 *    wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_wc(const wchar_t *s, size_t length)
{
    wstr_hash_ty    hash;
    wstr_hash_ty    idx;
    wstring_ty      *p;

    if (!hash_modulus)
      wstr_initialize();
    hash = hash_generate(s, length);

    idx = hash & hash_cutover_mask;
    if (idx < hash_split)
      idx = hash & hash_cutover_split_mask;

    for (p = hash_table[idx]; p; p = p->wstr_next)
    {
      if
      (
          p->wstr_hash == hash
      &&
          p->wstr_length == length
      &&
          !memcmp(p->wstr_text, s, length * sizeof(wchar_t))
      )
      {
          p->wstr_references++;
          return p;
      }
    }

    p = (wstring_ty *)mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t));
    p->wstr_hash = hash;
    p->wstr_length = length;
    p->wstr_references = 1;
    p->wstr_next = hash_table[idx];
    hash_table[idx] = p;
    memcpy(p->wstr_text, s, length * sizeof(wchar_t));
    p->wstr_text[length] = 0;

    hash_load++;
    while (hash_load * 10 > hash_modulus * 8)
      split();
    ++changed;
    return p;
}


/*
 * NAME
 *    wstr_copy - make a copy of a string
 *
 * SYNOPSIS
 *    wstring_ty *wstr_copy(wstring_ty *s);
 *
 * DESCRIPTION
 *    The wstr_copy function is used to make a copy of a string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.
 *    Use wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_copy(wstring_ty *s)
{
    s->wstr_references++;
    return s;
}


/*
 * NAME
 *    wstr_free - release a string
 *
 * SYNOPSIS
 *    void wstr_free(wstring_ty *s);
 *
 * DESCRIPTION
 *    The wstr_free function is used to indicate that a string hash been
 *    finished with.
 *
 * RETURNS
 *    void
 *
 * CAVEAT
 *    This is the only way to release strings DO NOT use the free function.
 */

void
wstr_free(wstring_ty *s)
{
    wstr_hash_ty    idx;
    wstring_ty      **spp;

    if (!s)
      return;
    if (s->wstr_references > 1)
    {
      s->wstr_references--;
      return;
    }
    ++changed;

    /*
     * find the hash bucket it was in,
     * and remove it
     */
    idx = s->wstr_hash & hash_cutover_mask;
    if (idx < hash_split)
      idx = s->wstr_hash & hash_cutover_split_mask;
    for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next)
    {
      if (*spp == s)
      {
          *spp = s->wstr_next;
          free(s);
          --hash_load;
          return;
      }
    }

    /*
     * should never reach here!
     */
    fatal_raw("attempted to free non-existent wstring (bug)");
}


/*
 * NAME
 *    wstr_catenate - join two strings
 *
 * SYNOPSIS
 *    wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_catenate function is used to concatenate two strings to form a
 *    new string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.
 *    Use wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_catenate(const wstring_ty *s1, const wstring_ty *s2)
{
    static wchar_t  *tmp;
    static size_t   tmplen;
    wstring_ty      *s;
    size_t          length;

    length = s1->wstr_length + s2->wstr_length;
    if (length > tmplen)
    {
      tmplen = length;
      tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
    }
    memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
    memcpy
    (
      tmp + s1->wstr_length,
      s2->wstr_text,
      s2->wstr_length * sizeof(wchar_t)
    );
    s = wstr_n_from_wc(tmp, length);
    return s;
}


/*
 * NAME
 *    wstr_cat_three - join three strings
 *
 * SYNOPSIS
 *    wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_cat_three function is used to concatenate three strings to form
 *    a new string.
 *
 * RETURNS
 *    wstring_ty* - a pointer to a string in dynamic memory.
 *    Use wstr_free when finished with.
 *
 * CAVEAT
 *    The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_cat_three(const wstring_ty *s1, const wstring_ty *s2, const wstring_ty *s3)
{
    static wchar_t  *tmp;
    static size_t   tmplen;
    wstring_ty      *s;
    size_t          length;

    length = s1->wstr_length + s2->wstr_length + s3->wstr_length;
    if (tmplen < length)
    {
      tmplen = length;
      tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
    }
    memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
    memcpy
    (
      tmp + s1->wstr_length,
      s2->wstr_text,
      s2->wstr_length * sizeof(wchar_t)
    );
    memcpy
    (
      tmp + s1->wstr_length + s2->wstr_length,
      s3->wstr_text,
      s3->wstr_length * sizeof(wchar_t)
    );
    s = wstr_n_from_wc(tmp, length);
    return s;
}


wstring_ty *
wstr_capitalize(const wstring_ty *ws)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;
    int             prev_was_alpha;

    if (ws->wstr_length > buflen)
    {
      buflen = ws->wstr_length;
      buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
    }
    language_human();
    prev_was_alpha = 0;
    for (j = 0; j < ws->wstr_length; ++j)
    {
      wchar_t         c;

      c = ws->wstr_text[j];
      if (iswlower(c))
      {
          if (!prev_was_alpha)
            c = towupper(c);
          prev_was_alpha = 1;
      }
      else if (iswupper(c))
      {
          if (prev_was_alpha)
            c = towlower(c);
          prev_was_alpha = 1;
      }
      else
          prev_was_alpha = 0;
      buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_upper(const wstring_ty *ws)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (ws->wstr_length > buflen)
    {
      buflen = ws->wstr_length;
      buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
    }
    language_human();
    for (j = 0; j < ws->wstr_length; ++j)
    {
      wchar_t         c;

      c = ws->wstr_text[j];
      if (iswlower(c))
          c = towupper(c);
      buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_lower(const wstring_ty *ws)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (ws->wstr_length > buflen)
    {
      buflen = ws->wstr_length;
      buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
    }
    language_human();
    for (j = 0; j < ws->wstr_length; ++j)
    {
      wchar_t         c;

      c = ws->wstr_text[j];
      if (iswupper(c))
          c = towlower(c);
      buffer[j] = c;
    }
    language_C();
    return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_ident(const wstring_ty *ws)
{
    static wchar_t  *buffer;
    static size_t   buflen;
    size_t          j;

    if (ws->wstr_length == 0)
      return wstr_from_c("_");
    if (ws->wstr_length > buflen)
    {
      buflen = ws->wstr_length;
      buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
    }
    language_human();
    for (j = 0; j < ws->wstr_length; ++j)
    {
      wchar_t         c;

      c = ws->wstr_text[j];
      if (!iswalnum(c))
          c = '_';
      buffer[j] = c;
    }
    if (iswdigit(buffer[0]))
      buffer[0] = '_';
    language_C();
    return wstr_n_from_wc(buffer, ws->wstr_length);
}


/*
 * NAME
 *    wstr_equal - test equality of strings
 *
 * SYNOPSIS
 *    int wstr_equal(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *    The wstr_equal function is used to test if two strings are equal.
 *
 * RETURNS
 *    int; zero if the strings are not equal, nonzero if the strings are
 *    equal.
 *
 * CAVEAT
 *    This function is implemented as a macro in strings.h
 */

#ifndef wstr_equal

int
wstr_equal(const wstring_ty *s1, const wstring_ty *s2)
{
    return (s1 == s2);
}

#endif


wstring_ty *
str_to_wstr(const string_ty *s)
{
    return wstr_n_from_c(s->str_text, s->str_length);
}


string_ty *
wstr_to_str(const wstring_ty *ws)
{
    char            *text;
    size_t          length;

    wstr_to_mbs(ws, &text, &length);
    return str_n_from_c(text, length);
}

Generated by  Doxygen 1.6.0   Back to index