Logo Search packages:      
Sourcecode: yiyantang version File versions  Download package

hzsegment.c

/************************************************************************** 
 * yyt - Pseudo tty that converts among different Chinese encodings.
 *
 * Copyright (C) 2001 ha shao
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  
 *
 **************************************************************************/

/* Return a segment containing only ascii or only double byte hanzi.
 * Otherwise we cannot guess the encoding correctly, because input is
 * read in units of BUFFSIZE. That is the price paid to auto-detect the
 * encoding. There is no line buffered I/O; we need raw I/O. We will see
 * how far it can go.
 */

/* We don't do utf8 encoding yet. */

#include <ctype.h>

#include "hzsegment.h"

/* Byte classification used by hzsegment().
 *
 *   testascii(x)  x is in the single-byte range 0x00-0x80. Note that 0x80
 *                 is accepted here as well as plain 7-bit ascii; it is the
 *                 one value below the ishz1() range.
 *   ishz1(x)      x may be the first byte of a double byte hanzi.
 *   ishz2(x)      x may be the second byte of a double byte hanzi.
 *
 * Arguments are parenthesized so any expression can be passed. ishz1() and
 * ishz2() evaluate their argument more than once: do not pass expressions
 * with side effects.
 */
#define testascii(x)    ((x) <= 0x80)
#define ishz1(x)  (((x) >= 0x81) && ((x) <= 0xFE))
#define ishz2(x)  ((((x) >= 0x40) && ((x) <= 0x7E)) || (((x) >= 0xA1) && ((x) <= 0xFE)))

#if 0                   /* So GBK is worse than Big5. */
#define ishz2(x)  ((((x) >= 0x40) && ((x) <= 0x7E)) || (((x) >= 0x80) && ((x) <= 0xFE)))
#endif

/* Segment buf at a hanzi boundary.
 *
 * buf    bytes to examine; at most the first count bytes are read.
 * count  number of valid bytes in buf.
 *
 * Returns the length of the leading segment of buf:
 *   - If buf starts with a single-byte character (and that byte is not the
 *     second half of a hanzi immediately followed by a hanzi first byte),
 *     the segment ends just before the first byte that can start a hanzi.
 *   - Otherwise the segment is a double byte run. It ends just before the
 *     first single-byte character, at index 1 or later, that is neither
 *     graphic (per isgraph()) nor a space.
 *   - If no such boundary exists the whole buffer is one segment and count
 *     is returned.
 * Returns 0 when buf is a null pointer or count is not positive.
 */
int hzsegment (unsigned char *buf, int count)
{
      int i;
      int single_byte;

      /* Nothing to segment. This also keeps buf[0] from being read when
       * the caller has no valid data. */
      if (!buf || count <= 0) {
            return 0;
      }

      /* Single-byte start, unless the first byte is really the 2nd half of
       * a hanzi that was cut by the previous read. The look-ahead at buf[1]
       * is only legal when a second byte exists. */
      single_byte = testascii(buf[0]);
      if (single_byte && count > 1 && ishz2(buf[0]) && ishz1(buf[1])) {
            single_byte = 0;
      }

      if (single_byte) {
            for (i = 0; i < count; i++) {
                  if (ishz1(buf[i])) {
                        return i;
                  }
            }
            return count;
      }

      /* Double byte segment.
       *
       * Don't let printable ascii break a double byte sequence (but we
       * don't want to swallow newline), since curses programs use
       * non-printable chars to move the cursor around. Keeping printable
       * ascii gives a longer hanzi string and therefore a more accurate
       * encoding guess.
       *
       * isgraph() is locale dependent; an old GB locale might be a bit
       * broken for it. */
      for (i = 1; i < count; i++) {
            if (!testascii(buf[i])) {
                  continue;
            }
            /* isgraph() is false for space, so space is kept explicitly. */
            if (isgraph(buf[i]) || buf[i] == 0x20) {
                  continue;
            }
            /* A byte that is the 2nd half of a hanzi is not a boundary. */
            if (ishz2(buf[i]) && ishz1(buf[i - 1])) {
                  continue;
            }
            return i;
      }
      return count;
}



/* vim: set ts=4:sw=4:  */

Generated by  Doxygen 1.6.0   Back to index