l3afpad/src/encoding.c

/*
 *  L3afpad - GTK+ based simple text editor
 *  Copyright (C) 2004-2005 Tarot Osuji
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <glib.h>
#include <string.h>
#include "encoding.h"

#define MAX_COUNTRY_NUM 10

/*
 * Locale groups.  Each enumerator is the row index used for both
 * country_table and encoding_table; END_CODE is the number of groups and
 * is also used by get_encoding_code() as its "not determined yet" marker.
 */
enum {
	LATIN1 = 0,     /* fallback group: has no language entries */
	LATIN2,
	LATIN3,
	LATIN4,
	LATINC,         /* ISO-8859-5 / KOI8-R */
	LATINC_UA,      /* ISO-8859-5 / KOI8-U */
	LATINC_TJ,      /* ISO-8859-5 / KOI8-T */
	LATINA,         /* ISO-8859-6 */
	LATING,         /* ISO-8859-7 */
	LATINH,         /* ISO-8859-8 */
	LATIN5,         /* ISO-8859-9 */
	CHINESE_CN,
	CHINESE_TW,
	CHINESE_HK,
	JAPANESE,
	KOREAN,
	VIETNAMESE,
	THAI,
	GEORGIAN,
	END_CODE
};

/*
 * Locale-name prefixes belonging to each group.
 *
 * A row ends at its first NULL entry.  The LATIN2 row fills all
 * MAX_COUNTRY_NUM slots and therefore has no NULL terminator: any scan of a
 * row must also be bounded by MAX_COUNTRY_NUM.
 */
static const gchar *country_table[END_CODE][MAX_COUNTRY_NUM] =
{
	[LATIN1]     = { NULL },
	[LATIN2]     = { "cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", "uz" },
	[LATIN3]     = { "eo", "mt", NULL },
	[LATIN4]     = { "et", "lt", "lv", "mi", NULL },
	[LATINC]     = { "be", "bg", "ky", "mk", "mn", "ru", "tt", NULL },
	[LATINC_UA]  = { "uk", NULL },
	[LATINC_TJ]  = { "tg", NULL },
	[LATINA]     = { "ar", "fa", "ur", NULL },
	[LATING]     = { "el", NULL },
	[LATINH]     = { "he", "yi", NULL },
	[LATIN5]     = { "az", "tr", NULL },
	[CHINESE_CN] = { "zh_CN", NULL },
	[CHINESE_TW] = { "zh_TW", NULL },
	[CHINESE_HK] = { "zh_HK", NULL },
	[JAPANESE]   = { "ja", NULL },
	[KOREAN]     = { "ko", NULL },
	[VIETNAMESE] = { "vi", NULL },
	[THAI]       = { "th", NULL },
	[GEORGIAN]   = { "ka", NULL },
};

/*
 * Candidate charset names for each group, one column per naming scheme
 * (IANA, OpenI18N, Codepage).  A NULL cell means the group has no entry
 * for that column.
 */
static const gchar *encoding_table[END_CODE][ENCODING_MAX_ITEM_NUM] =
{
	/*                 IANA           OpenI18N       Codepage */
	[LATIN1]     = { "ISO-8859-1",  "ISO-8859-15", "CP1252" },
	[LATIN2]     = { "ISO-8859-2",  "ISO-8859-16", "CP1250" },
	[LATIN3]     = { "ISO-8859-3",  NULL,          NULL },
	[LATIN4]     = { "ISO-8859-4",  "ISO-8859-13", "CP1257" },
	[LATINC]     = { "ISO-8859-5",  "KOI8-R",      "CP1251" },
	[LATINC_UA]  = { "ISO-8859-5",  "KOI8-U",      "CP1251" },
	[LATINC_TJ]  = { "ISO-8859-5",  "KOI8-T",      "CP1251" },
	[LATINA]     = { "ISO-8859-6",  NULL,          "CP1256" },
	[LATING]     = { "ISO-8859-7",  NULL,          "CP1253" },
	[LATINH]     = { "ISO-8859-8",  NULL,          "CP1255" },
	[LATIN5]     = { "ISO-8859-9",  NULL,          "CP1254" },
	[CHINESE_CN] = { "GB2312",      "GB18030",     "CP936" },
	[CHINESE_TW] = { "BIG5",        "EUC-TW",      "CP950" },
	[CHINESE_HK] = { "BIG5",        "BIG5-HKSCS",  "CP950" },
	[JAPANESE]   = { "ISO-2022-JP", "EUC-JP",      "CP932" },
	[KOREAN]     = { "ISO-2022-KR", "EUC-KR",      "CP949" },
	[VIETNAMESE] = { NULL,          "VISCII",      "CP1258" },
	[THAI]       = { NULL,          "TIS-620",     "CP874" },
	[GEORGIAN]   = { NULL,          "GEORGIAN-PS", NULL },
};

/*
 * Determine which locale group (row of country_table / encoding_table) the
 * user's environment belongs to.
 *
 * The locale name is taken from the first variable among LC_ALL, LC_CTYPE
 * and LANG that is set to a non-empty value.  It is compared against every
 * entry of country_table rows 1..END_CODE-1; an entry matches only when it
 * is a prefix of the locale name AND is followed by the end of the name or
 * by one of '_', '.', '@'.  The boundary check keeps three-letter language
 * codes (e.g. "kok", "kab") from matching two-letter entries ("ko", "ka").
 *
 * The result is computed on the first call and cached for later calls.
 *
 * Returns: the matching row index, or 0 (LATIN1) when the locale is unset,
 *          shorter than two characters, or matches no entry.
 */
guint get_encoding_code(void)
{
	static const gchar *const locale_vars[] = { "LC_ALL", "LC_CTYPE", "LANG" };
	static guint code = END_CODE;
	const gchar *env = NULL;
	guint i, j;

	if (code != END_CODE)
		return code;

	/* An empty value is treated like an unset variable. */
	for (i = 0; i < sizeof locale_vars / sizeof locale_vars[0]; i++) {
		const gchar *value = g_getenv(locale_vars[i]);

		if (value && *value != '\0') {
			env = value;
			break;
		}
	}

	if (env && strlen(env) >= 2) {
		/* Row 0 (LATIN1) has no entries; it is the fallback below. */
		for (j = 1; j < END_CODE && code == END_CODE; j++) {
			/* Rows end at NULL, but a full row has no terminator. */
			for (i = 0; i < MAX_COUNTRY_NUM && country_table[j][i]; i++) {
				const gchar *lang = country_table[j][i];
				const size_t n = strlen(lang);

				if (strncmp(env, lang, n) != 0)
					continue;
				/* env has at least n characters here, so env[n] is
				 * at worst its terminator. */
				if (env[n] == '\0' || env[n] == '_'
				    || env[n] == '.' || env[n] == '@') {
					code = j;
					break;
				}
			}
		}
	}

	if (code == END_CODE)
		code = 0; /* LATIN1 */

	return code;
}

/*
 * Return the charset candidates of locale group `code`.
 *
 * code: row index into encoding_table (normally the value returned by
 *       get_encoding_code()).  An out-of-range value selects row 0.
 *
 * Returns: pointer to a statically allocated EncArray whose items are the
 *          cells of the selected row (a cell may be NULL).  The storage is
 *          owned by this function: it must not be freed, and it is
 *          overwritten by the next call.
 *
 * The array is refilled on every call, so the result always corresponds to
 * the `code` that was passed in.
 */
EncArray *get_encoding_items(guint code)
{
	static EncArray array;
	gint i;

	if (code >= sizeof encoding_table / sizeof encoding_table[0])
		code = 0;

	for (i = 0; i < ENCODING_MAX_ITEM_NUM; i++)
		array.item[i] = encoding_table[code][i];

	return &array;
}

const gchar *get_default_charset(void)
{
	const gchar *charset;

	g_get_charset(&charset);

	return charset;
}

/*
 * Imported from KEdit (the BeOS editor, not the KDE one).
 * Based on:
 * http://examples.oreilly.com/cjkvinfo/Ch7/DetectCodeType.c
 */
/*
 * Normalise line endings to LF, in place.
 *
 * Every CR becomes LF; when the CR is immediately followed by LF the pair
 * collapses into a single LF.  The result is never longer than the input,
 * so the same buffer is used for reading and writing.
 *
 * text: NUL-terminated buffer to rewrite.
 */
void convert_line_ending_to_lf(gchar *text)
{
	const gchar *src = text;   /* read position */
	gchar *dst = text;         /* write position, never ahead of src */

	for (;;) {
		const gchar ch = *src++;

		if (ch == CR) {
			*dst++ = LF;
			/* Swallow the LF half of a CR+LF pair. */
			if (*src == LF)
				src++;
			continue;
		}

		*dst++ = ch;
		if (ch == '\0')
			return;
	}
}

/*
 * Convert LF line endings in *text to the requested style.
 *
 * text:    address of a NUL-terminated buffer.  For CR+LF the buffer is
 *          released with g_free() and *text is replaced by a new buffer
 *          obtained from g_new(); for CR it is modified in place.
 * retcode: CR, CR+LF, or anything else (treated as "leave as is").
 *
 * Lengths are tracked as size_t so that buffers larger than INT_MAX bytes
 * do not overflow the size computation.
 */
void convert_line_ending(gchar **text, gint retcode)
{
	gchar *str = *text;
	gchar *buf, *dst;
	const gchar *src;
	size_t lf_count = 0;

	switch (retcode) {
	case CR:
		/* Same length: overwrite each LF with CR. */
		for (; *str != '\0'; str++) {
			if (*str == LF)
				*str = CR;
		}
		break;
	case CR+LF:
		/* One pass measures the text and counts the LFs to expand. */
		for (src = str; *src != '\0'; src++) {
			if (*src == LF)
				lf_count++;
		}
		buf = g_new(gchar, (size_t)(src - str) + lf_count + 1);
		dst = buf;
		for (src = str; *src != '\0'; src++) {
			if (*src == LF)
				*dst++ = CR;
			*dst++ = *src;
		}
		*dst = '\0';
		g_free(*text);
		*text = buf;
		break;
	default:
		/* LF or unknown: nothing to convert. */
		break;
	}
}

/*
 * Report the style of the first line ending found in `text`.
 *
 * text: NUL-terminated buffer to inspect.
 *
 * Returns: CR+LF when the first line break is a CR followed by LF,
 *          CR when it is a lone CR,
 *          LF when it is an LF or when the text has no line break.
 *
 * Every byte is examined, starting with the first one, so a text that
 * begins with a line break is classified by that break.
 */
gint detect_line_ending(const gchar *text)
{
	for (; *text != '\0'; text++) {
		if (*text == LF)
			return LF;
		if (*text == CR)
			/* text[1] is at worst the terminator. */
			return (text[1] == LF) ? CR+LF : CR;
	}
	return LF;
}

/*
 * Guess the charset of Cyrillic text from the distribution of high bytes.
 *
 * Bytes are bucketed by their upper nibble: 0x80-0x9F only sets a flag,
 * 0xC0-0xCF, 0xD0-0xDF and 0xE0-0xFF are counted separately.
 *
 * Returns: "ISO-8859-5" when no 0x80-0x9F byte was seen and the 0xD0 bucket
 *          outweighs the other two together; otherwise "CP1251" when the
 *          0xE0-0xFF bucket outweighs the other two together; otherwise the
 *          OPENI18N entry of the current locale group.
 */
static const gchar *detect_charset_cylillic(const gchar *text)
{
	const gchar *fallback = get_encoding_items(get_encoding_code())->item[OPENI18N];
	gboolean c1_seen = FALSE;
	guint32 count_c = 0, count_d = 0, count_ef = 0;
	guint8 byte;

	for (; (byte = *text) != '\0'; text++) {
		switch (byte >> 4) {
		case 0x8:
		case 0x9:
			c1_seen = TRUE;
			break;
		case 0xC:
			count_c++;
			break;
		case 0xD:
			count_d++;
			break;
		case 0xE:
		case 0xF:
			count_ef++;
			break;
		default:
			break;
		}
	}

	if (!c1_seen && (count_c + count_ef) < count_d)
		return "ISO-8859-5";
	if ((count_c + count_d) < count_ef)
		return "CP1251";

	return fallback;
}

/*
 * Guess the charset of Chinese text from lead/trail byte ranges.
 *
 * Starts from the IANA entry of the current locale group and scans the
 * text:
 *   - a lead byte 0x81-0x87 decides "GB18030" immediately;
 *   - for any lead byte >= 0x88 the following byte is taken as trail byte;
 *     a trail byte in 0x30-0x39 or 0x80-0xA0 decides "GB18030" immediately;
 *   - a lead byte in 0xA1-0xC6 or 0xC9-0xF9 with a trail byte in 0x40-0x7E
 *     switches the answer to "BIG5" and scanning continues.
 *
 * The trail byte is only consumed when it is not the terminator, so a lead
 * byte at the very end of the text cannot push the scan past the buffer.
 *
 * Returns: the charset name chosen as described above.
 */
static const gchar *detect_charset_chinese(const gchar *text)
{
	guint8 c;
	guint8 lead;
	const gchar *charset = get_encoding_items(get_encoding_code())->item[IANA];

	while ((c = *text++) != '\0') {
		if (c < 0x81)
			continue;
		if (c <= 0x87) {
			charset = "GB18030";
			break;
		}

		/* c >= 0x88: lead byte; peek at the trail byte. */
		lead = c;
		c = *text;
		if (c != '\0')
			text++;

		if ((c >= 0x30 && c <= 0x39) || (c >= 0x80 && c <= 0xA0)) {
			charset = "GB18030";
			break;
		}
		/* Leads outside these ranges: GBK/Big5-HKSCS cannot be told apart. */
		if (((lead >= 0xA1 && lead <= 0xC6) || (lead >= 0xC9 && lead <= 0xF9))
		    && c >= 0x40 && c <= 0x7E)
			charset = "BIG5";
	}

	return charset;
}

/*
 * Guess whether Japanese text is CP932 or EUC-JP.
 *
 * Scans until a byte pair that the checks below attribute to CP932 is
 * found, or until a byte pattern ends the scan early (lead byte >= 0xF0,
 * or a trail byte >= 0xFD after certain lead bytes).
 *
 * A trail byte is only consumed when it is not the terminator, so a lead
 * byte at the very end of the text cannot push the scan past the buffer;
 * the range checks then see a trail value of 0, exactly as when the
 * terminator was read before.
 *
 * Returns: "CP932" when such a pair was found, otherwise "EUC-JP".
 */
static const gchar *detect_charset_japanese(const gchar *text)
{
	guint8 c;
	const gchar *charset = NULL;

	while (charset == NULL && (c = *text++) != '\0') {
		if (c >= 0x81 && c <= 0x9F) {
			if (c == 0x8E) /* SS2 */ {
				c = *text;
				if (c != '\0')
					text++;
				if ((c >= 0x40 && c <= 0xA0) || (c >= 0xE0 && c <= 0xFC))
					charset = "CP932";
			}
			else if (c == 0x8F) /* SS3 */ {
				c = *text;
				if (c != '\0')
					text++;
				if (c >= 0x40 && c <= 0xA0)
					charset = "CP932";
				else if (c >= 0xFD)
					break;
			}
			else
				charset = "CP932";
		}
		else if (c >= 0xA1 && c <= 0xDF) {
			c = *text;
			if (c != '\0')
				text++;
			if (c <= 0x9F)
				charset = "CP932";
			else if (c >= 0xFD)
				break;
		}
		else if (c >= 0xE0 && c <= 0xEF) {
			c = *text;
			if (c != '\0')
				text++;
			if (c >= 0x40 && c <= 0xA0)
				charset = "CP932";
			else if (c >= 0xFD)
				break;
		}
		else if (c >= 0xF0)
			break;
	}

	if (charset == NULL)
		charset = "EUC-JP";

	return charset;
}

/*
 * Guess whether Korean text is EUC-KR, CP949 or CP1361.
 *
 * Scans lead/trail byte pairs.  Some pairs decide the answer at once;
 * others only record evidence in two flags:
 *   noneuc   - set when a pair was seen that the checks treat as not EUC-KR,
 *   nonjohab - set when a pair was seen that the checks treat as not CP1361.
 * As soon as both flags are set the answer is "CP949".
 *
 * A lead byte 0x81-0x83 decides "CP949" without looking at a trail byte.
 * Every lead byte >= 0x84 takes the following byte as its trail byte; that
 * byte is only consumed when it is not the terminator, so a lead byte at
 * the very end of the text cannot push the scan past the buffer (the range
 * checks then see a trail value of 0).
 *
 * Returns: the decided charset; when the scan ends undecided, "CP949" if
 *          noneuc is set and "EUC-KR" otherwise.
 */
static const gchar *detect_charset_korean(const gchar *text)
{
	guint8 c;
	guint8 lead;
	gboolean noneuc = FALSE;
	gboolean nonjohab = FALSE;
	const gchar *charset = NULL;

	while (charset == NULL && (c = *text++) != '\0') {
		if (c >= 0x81 && c < 0x84) {
			charset = "CP949";
		}
		else if (c >= 0x84) {
			lead = c;
			c = *text;
			if (c != '\0')
				text++;

			if (lead < 0xA1) {
				noneuc = TRUE;
				if ((c > 0x5A && c < 0x61) || (c > 0x7A && c < 0x81))
					charset = "CP1361";
				else if (c == 0x52 || c == 0x72 || c == 0x92 || (c > 0x9D && c < 0xA1)
					|| c == 0xB2 || (c > 0xBD && c < 0xC1) || c == 0xD2
					|| (c > 0xDD && c < 0xE1) || c == 0xF2 || c == 0xFE)
					charset = "CP949";
			}
			else if (lead <= 0xC6) {
				if (c < 0xA1) {
					noneuc = TRUE;
					if ((c > 0x5A && c < 0x61) || (c > 0x7A && c < 0x81))
						charset = "CP1361";
					else if (c == 0x52 || c == 0x72 || c == 0x92 || (c > 0x9D && c < 0xA1))
						charset = "CP949";
					else if (c == 0xB2 || (c > 0xBD && c < 0xC1) || c == 0xD2
						|| (c > 0xDD && c < 0xE1) || c == 0xF2 || c == 0xFE)
						nonjohab = TRUE;
				}
			}
			else if (lead <= 0xD3) {
				if (c < 0xA1)
					charset = "CP1361";
			}
			else if (lead < 0xD8) {
				/* The trail byte is skipped without being inspected. */
				nonjohab = TRUE;
			}
			else {
				if (c < 0xA1)
					charset = "CP1361";
			}
		}
		/* Evidence against both EUC-KR and CP1361 overrides any pick. */
		if (noneuc && nonjohab)
			charset = "CP949";
	}

	if (charset == NULL) {
		if (noneuc)
			charset = "CP949";
		else
			charset = "EUC-KR";
	}

	return charset;
}

/*
 * Tell whether `text` contains any byte in the range 0x80-0x9F.
 *
 * Returns: TRUE at the first such byte, FALSE when the terminator is
 *          reached without finding one.
 */
static gboolean detect_noniso(const gchar *text)
{
	const guint8 *p;

	for (p = (const guint8 *) text; *p != '\0'; p++) {
		/* 0x80-0x9F are exactly the bytes whose top three bits are 100. */
		if ((*p & 0xE0) == 0x80)
			return TRUE;
	}

	return FALSE;
}

/*
 * Guess the charset of `text`.
 *
 * text: NUL-terminated buffer to inspect.
 *
 * 1. When the text is valid UTF-8 it is scanned byte by byte:
 *      - the first byte above 0x7F decides "UTF-8";
 *      - an "ESC $" sequence selects "ISO-2022-JP", "ISO-2022-JP-2" or
 *        "ISO-2022-KR" depending on the bytes that follow.  After
 *        "ESC $ B" / "ESC $ @" scanning continues (a later sequence may
 *        change the answer); any other "ESC $" sequence ends the scan.
 *    When the scan decides nothing, get_default_charset() is used.
 * 2. Otherwise (or if step 1 still produced no name) the answer depends on
 *    the locale group from get_encoding_code(): the Cyrillic, Chinese,
 *    Japanese and Korean groups use their byte-pattern detectors; the
 *    Vietnamese, Thai and Georgian groups use their OPENI18N entry; all
 *    other groups use the default charset unless that is "UTF-8", in which
 *    case the CODEPAGE entry (if a 0x80-0x9F byte is present) or the
 *    OPENI18N entry is used, falling back to the IANA entry when that
 *    cell is NULL.
 *
 * The scan uses its own cursor so `text` still points at the start of the
 * buffer in step 2, and the byte after ESC is only consumed when it is '$',
 * so a trailing ESC cannot push the scan past the terminator and an ESC
 * that directly follows another ESC is still recognised.
 *
 * Returns: the charset name chosen as described above.
 */
const gchar *detect_charset(const gchar *text)
{
	const gchar *p = text;
	const gchar *charset = NULL;
	guint8 c;

	if (g_utf8_validate(text, -1, NULL)) {
		while ((c = *p++) != '\0') {
			if (c > 0x7F) {
				charset = "UTF-8";
				break;
			}
			if (c != 0x1B) /* ESC */
				continue;
			if (*p != '$')
				continue;
			p++;

			/* A terminator here matches no case and ends the scan. */
			c = *p++;
			switch (c) {
			case 'B': // JIS X 0208-1983
			case '@': // JIS X 0208-1978
				charset = "ISO-2022-JP";
				continue; /* keep scanning the text */
			case 'A': // GB2312-1980
				charset = "ISO-2022-JP-2";
				break;
			case '(':
				c = *p++;
				switch (c) {
				case 'C': // KSC5601-1987
				case 'D': // JIS X 0212-1990
					charset = "ISO-2022-JP-2";
				}
				break;
			case ')':
				c = *p++;
				if (c == 'C')
					charset = "ISO-2022-KR"; // KSC5601-1987
			}
			break;
		}
		if (!charset)
			charset = get_default_charset();
	}

	if (!charset) {
		switch (get_encoding_code()) {
		case LATINC:
		case LATINC_UA:
		case LATINC_TJ:
			charset = detect_charset_cylillic(text); // fuzzy...
			break;
		case CHINESE_CN:
		case CHINESE_TW:
		case CHINESE_HK:
			charset = detect_charset_chinese(text);
			break;
		case JAPANESE:
			charset = detect_charset_japanese(text);
			break;
		case KOREAN:
			charset = detect_charset_korean(text);
			break;
		case VIETNAMESE:
		case THAI:
		case GEORGIAN:
			charset = get_encoding_items(get_encoding_code())->item[OPENI18N];
			break;
		default:
			if (strcmp(get_default_charset(), "UTF-8") != 0)
				charset = get_default_charset();
			else if (detect_noniso(text))
				charset = get_encoding_items(get_encoding_code())->item[CODEPAGE];
			else
				charset = get_encoding_items(get_encoding_code())->item[OPENI18N];
			/* Some groups have no CODEPAGE/OPENI18N entry. */
			if (!charset)
				charset = get_encoding_items(get_encoding_code())->item[IANA];
		}
	}

	return charset;
}
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`/*`
			`* L3afpad - GTK+ based simple text editor`
			`* Copyright (C) 2004-2005 Tarot Osuji`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00			`*`
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License along`
			`* with this program; if not, write to the Free Software Foundation, Inc.,`
			`* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`
			`*/`

			`#include <glib.h>`
			`#include <string.h>`
			`#include "encoding.h"`

			`#define MAX_COUNTRY_NUM 10`

			`enum {`
			`LATIN1 = 0,`
			`LATIN2,`
			`LATIN3,`
			`LATIN4,`
			`LATINC,`
			`LATINC_UA,`
			`LATINC_TJ,`
			`LATINA,`
			`LATING,`
			`LATINH,`
			`LATIN5,`
			`CHINESE_CN,`
			`CHINESE_TW,`
			`CHINESE_HK,`
			`JAPANESE,`
			`KOREAN,`
			`VIETNAMESE,`
			`THAI,`
			`GEORGIAN,`
			`END_CODE`
			`};`

			`static const gchar *country_table[][MAX_COUNTRY_NUM] =`
			`{`
			`/* LATIN1 */ {NULL},`
			`/* LATIN2 */ {"cs", "hr", "hu", "pl", "ro", "sk", "sl", "sq", "sr", "uz"},`
			`/* LATIN3 */ {"eo", "mt", NULL},`
			`/* LATIN4 */ {"et", "lt", "lv", "mi", NULL},`
			`/* LATINC */ {"be", "bg", "ky", "mk", "mn", "ru", "tt", NULL},`
			`/* LATINC_UA */ {"uk", NULL},`
			`/* LATINC_TJ */ {"tg", NULL},`
			`/* LATINA */ {"ar", "fa", "ur", NULL},`
			`/* LATING */ {"el", NULL},`
			`/* LATINH */ {"he", "yi", NULL},`
			`/* LATIN5 */ {"az", "tr", NULL},`
			`/* CHINESE_CN */ {"zh_CN", NULL},`
			`/* CHINESE_TW */ {"zh_TW", NULL},`
			`/* CHINESE_HK */ {"zh_HK", NULL},`
			`/* JAPANESE */ {"ja", NULL},`
			`/* KOREAN */ {"ko", NULL},`
			`/* VIETNAMESE */ {"vi", NULL},`
			`/* THAI */ {"th", NULL},`
			`/* GEORGIAN */ {"ka", NULL},`
			`};`

			`static const gchar *encoding_table[][ENCODING_MAX_ITEM_NUM] =`
			`{`
			`/* IANA OpenI18N Codepage */`
			`/* LATIN1 */ { "ISO-8859-1", "ISO-8859-15", "CP1252" },`
			`/* LATIN2 */ { "ISO-8859-2", "ISO-8859-16", "CP1250" },`
			`/* LATIN3 */ { "ISO-8859-3", NULL, NULL },`
			`/* LATIN4 */ { "ISO-8859-4", "ISO-8859-13", "CP1257" },`
			`/* LATINC */ { "ISO-8859-5", "KOI8-R", "CP1251" },`
			`/* LATINC_UA */ { "ISO-8859-5", "KOI8-U", "CP1251" },`
			`/* LATINC_TJ */ { "ISO-8859-5", "KOI8-T", "CP1251" },`
			`/* LATINA */ { "ISO-8859-6", NULL, "CP1256" },`
			`/* LATING */ { "ISO-8859-7", NULL, "CP1253" },`
			`/* LATINH */ { "ISO-8859-8", NULL, "CP1255" },`
			`/* LATIN5 */ { "ISO-8859-9", NULL, "CP1254" },`
			`/* CHINESE_CN */ { "GB2312", "GB18030", "CP936" },`
			`/* CHINESE_TW */ { "BIG5", "EUC-TW", "CP950" },`
			`/* CHINESE_HK */ { "BIG5", "BIG5-HKSCS", "CP950" },`
			`/* JAPANESE */ { "ISO-2022-JP", "EUC-JP", "CP932" },`
			`/* KOREAN */ { "ISO-2022-KR", "EUC-KR", "CP949" },`
			`/* VIETNAMESE */ { NULL, "VISCII", "CP1258" },`
			`/* THAI */ { NULL, "TIS-620", "CP874" },`
			`/* GEORGIAN */ { NULL, "GEORGIAN-PS", NULL },`
			`};`

			`guint get_encoding_code(void)`
			`{`
			`static guint code = END_CODE;`
			`const gchar *env;`
			`guint i, j = 1;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (code == END_CODE) {`
			`env = g_getenv("LC_ALL");`
			`if (!env)`
			`env = g_getenv("LANG");`
			`if (env && strlen(env) >= 2)`
			`while (code == END_CODE && j < END_CODE) {`
			`for (i = 0; i < MAX_COUNTRY_NUM; i++) {`
			`if (!country_table[j][i])`
			`break;`
			`if (strncmp(env, country_table[j][i], strlen(country_table[j][i])) == 0) {`
			`code = j;`
			`break;`
			`}`
			`}`
			`j++;`
			`}`
			`if (code == END_CODE)`
			`code = 0;`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return code;`
			`}`

			`EncArray *get_encoding_items(guint code)`
			`{`
			`gint i;`
			`static EncArray *array = NULL;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (!array) {`
			`array = g_malloc(sizeof(EncArray));`
			`for (i = 0; i < ENCODING_MAX_ITEM_NUM; i++)`
			`array->item[i] = encoding_table[code][i] ?`
			`encoding_table[code][i] : NULL;`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return array;`
			`}`

			`const gchar *get_default_charset(void)`
			`{`
			`const gchar *charset;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`g_get_charset(&charset);`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`

			`/*`
			`Imported from KEdit (for BeOS, NOT KDE).`
			`based on`
			`http://examples.oreilly.com/cjkvinfo/Ch7/DetectCodeType.c`
			`*/`
			`void convert_line_ending_to_lf(gchar *text)`
			`{`
			`gint i, j;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`for (i = 0, j = 0; TRUE; i++, j++) {`
			`if (*(text + i) == CR) {`
			`*(text + j) = LF;`
			`if (*(text + i + 1) == LF)`
			`i++;`
			`} else {`
			`(text + j) = (text + i);`
			`if (*(text + j) == '\0')`
			`break;`
			`}`
			`}`
			`}`

			`void convert_line_ending(gchar **text, gint retcode)`
			`{`
			`gchar buf, str = *text;`
			`const gint len = strlen(str);`
			`gint i, j, LFNum = 0;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`switch (retcode) {`
			`case CR:`
			`while (*str != '\0') {`
			`if (*str == LF)`
			`*str = CR;`
			`str++;`
			`}`
			`break;`
			`case CR+LF:`
			`for (i = 0; *(str + i) != '\0'; i++) {`
			`if (*(str + i) == LF)`
			`LFNum++;`
			`}`
			`buf = g_new(gchar, len + LFNum + 1);`
			`for (i= 0, j = 0;; i++, j++) {`
			`if (*(str + j) == LF) {`
			`*(buf + i) = CR;`
			`*(buf + (++i)) = LF;`
			`} else`
			`(buf + i) = (str + j);`
			`if (*(str + j) == '\0')`
			`break;`
			`}`
			`g_free(*text);`
			`*text = buf;`
			`}`
			`}`

			`gint detect_line_ending(const gchar *text)`
			`{`
			`while (*(text++) != '\0') {`
			`if (*text == LF)`
			`break;`
			`if (*text == CR) {`
			`if (*(++text) == LF)`
			`return CR+LF;`
			`else`
			`return CR;`
			`}`
			`}`
			`return LF;`
			`}`

			`static const gchar detect_charset_cylillic(const gchar text)`
			`{`
			`guint8 c = *text;`
			`gboolean noniso = FALSE;`
			`guint32 xc = 0, xd = 0, xef = 0;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`const gchar *charset = get_encoding_items(get_encoding_code())->item[OPENI18N];`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`while ((c = *text++) != '\0') {`
			`if (c >= 0x80 && c <= 0x9F)`
			`noniso = TRUE;`
			`else if (c >= 0xC0 && c <= 0xCF)`
			`xc++;`
			`else if (c >= 0xD0 && c <= 0xDF)`
			`xd++;`
			`else if (c >= 0xE0)`
			`xef++;`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (!noniso && ((xc + xef) < xd))`
			`charset = "ISO-8859-5";`
			`else if ((xc + xd) < xef)`
			`charset = "CP1251";`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`

			`static const gchar detect_charset_chinese(const gchar text)`
			`{`
			`guint8 c = *text;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`const gchar *charset = get_encoding_items(get_encoding_code())->item[IANA];`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`while ((c = *text++) != '\0') {`
			`if (c >= 0x81 && c <= 0x87) {`
			`charset = "GB18030";`
			`break;`
			`}`
			`else if (c >= 0x88 && c <= 0xA0) {`
			`c = *text++;`
			`if ((c >= 0x30 && c <= 0x39) \|\| (c >= 0x80 && c <= 0xA0)) {`
			`charset = "GB18030";`
			`break;`
			`} //else GBK/Big5-HKSCS cannot determine`
			`}`
			`else if ((c >= 0xA1 && c <= 0xC6) \|\| (c >= 0xC9 && c <= 0xF9)) {`
			`c = *text++;`
			`if (c >= 0x40 && c <= 0x7E)`
			`charset = "BIG5";`
			`else if ((c >= 0x30 && c <= 0x39) \|\| (c >= 0x80 && c <= 0xA0)) {`
			`charset = "GB18030";`
			`break;`
			`}`
			`}`
			`else if (c >= 0xC7) {`
			`c = *text++;`
			`if ((c >= 0x30 && c <= 0x39) \|\| (c >= 0x80 && c <= 0xA0)) {`
			`charset = "GB18030";`
			`break;`
			`}`
			`}`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`

			`static const gchar detect_charset_japanese(const gchar text)`
			`{`
			`guint8 c = *text;`
			`gchar *charset = NULL;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`while (charset == NULL && (c = *text++) != '\0') {`
			`if (c >= 0x81 && c <= 0x9F) {`
			`if (c == 0x8E) /* SS2 */ {`
			`c = *text++;`
			`if ((c >= 0x40 && c <= 0xA0) \|\| (c >= 0xE0 && c <= 0xFC))`
			`charset = "CP932";`
			`}`
			`else if (c == 0x8F) /* SS3 */ {`
			`c = *text++;`
			`if (c >= 0x40 && c <= 0xA0)`
			`charset = "CP932";`
			`else if (c >= 0xFD)`
			`break;`
			`}`
			`else`
			`charset = "CP932";`
			`}`
			`else if (c >= 0xA1 && c <= 0xDF) {`
			`c = *text++;`
			`if (c <= 0x9F)`
			`charset = "CP932";`
			`else if (c >= 0xFD)`
			`break;`
			`}`
			`else if (c >= 0xE0 && c <= 0xEF) {`
			`c = *text++;`
			`if (c >= 0x40 && c <= 0xA0)`
			`charset = "CP932";`
			`else if (c >= 0xFD)`
			`break;`
			`}`
			`else if (c >= 0xF0)`
			`break;`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (charset == NULL)`
			`charset = "EUC-JP";`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`

			`static const gchar detect_charset_korean(const gchar text)`
			`{`
			`guint8 c = *text;`
			`gboolean noneuc = FALSE;`
			`gboolean nonjohab = FALSE;`
			`gchar *charset = NULL;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`while (charset == NULL && (c = *text++) != '\0') {`
			`if (c >= 0x81 && c < 0x84) {`
			`charset = "CP949";`
			`}`
			`else if (c >= 0x84 && c < 0xA1) {`
			`noneuc = TRUE;`
			`c = *text++;`
			`if ((c > 0x5A && c < 0x61) \|\| (c > 0x7A && c < 0x81))`
			`charset = "CP1361";`
			`else if (c == 0x52 \|\| c == 0x72 \|\| c == 0x92 \|\| (c > 0x9D && c < 0xA1)`
			`\|\| c == 0xB2 \|\| (c > 0xBD && c < 0xC1) \|\| c == 0xD2`
			`\|\| (c > 0xDD && c < 0xE1) \|\| c == 0xF2 \|\| c == 0xFE)`
			`charset = "CP949";`
			`}`
			`else if (c >= 0xA1 && c <= 0xC6) {`
			`c = *text++;`
			`if (c < 0xA1) {`
			`noneuc = TRUE;`
			`if ((c > 0x5A && c < 0x61) \|\| (c > 0x7A && c < 0x81))`
			`charset = "CP1361";`
			`else if (c == 0x52 \|\| c == 0x72 \|\| c == 0x92 \|\| (c > 0x9D && c < 0xA1))`
			`charset = "CP949";`
			`else if (c == 0xB2 \|\| (c > 0xBD && c < 0xC1) \|\| c == 0xD2`
			`\|\| (c > 0xDD && c < 0xE1) \|\| c == 0xF2 \|\| c == 0xFE)`
			`nonjohab = TRUE;`
			`}`
			`}`
			`else if (c > 0xC6 && c <= 0xD3) {`
			`c = *text++;`
			`if (c < 0xA1)`
			`charset = "CP1361";`
			`}`
			`else if (c > 0xD3 && c < 0xD8) {`
			`nonjohab = TRUE;`
			`c = *text++;`
			`}`
			`else if (c >= 0xD8) {`
			`c = *text++;`
			`if (c < 0xA1)`
			`charset = "CP1361";`
			`}`
			`if (noneuc && nonjohab)`
			`charset = "CP949";`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (charset == NULL) {`
			`if (noneuc)`
			`charset = "CP949";`
			`else`
			`charset = "EUC-KR";`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`

			`static gboolean detect_noniso(const gchar *text)`
			`{`
			`guint8 c = *text;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`while ((c = *text++) != '\0') {`
			`if (c >= 0x80 && c <= 0x9F)`
			`return TRUE;`
			`}`
			`return FALSE;`
			`}`

			`const gchar detect_charset(const gchar text)`
			`{`
			`guint8 c = *text;`
			`const gchar *charset = NULL;`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (g_utf8_validate(text, -1, NULL)) {`
			`while ((c = *text++) != '\0') {`
			`if (c > 0x7F) {`
			`charset = "UTF-8";`
			`break;`
			`}`
			`if (c == 0x1B) /* ESC */ {`
			`c = *text++;`
			`if (c == '$') {`
			`c = *text++;`
			`switch (c) {`
			`case 'B': // JIS X 0208-1983`
			`case '@': // JIS X 0208-1978`
			`charset = "ISO-2022-JP";`
			`continue;`
			`case 'A': // GB2312-1980`
			`charset = "ISO-2022-JP-2";`
			`break;`
			`case '(':`
			`c = *text++;`
			`switch (c) {`
			`case 'C': // KSC5601-1987`
			`case 'D': // JIS X 0212-1990`
			`charset = "ISO-2022-JP-2";`
			`}`
			`break;`
			`case ')':`
			`c = *text++;`
			`if (c == 'C')`
			`charset = "ISO-2022-KR"; // KSC5601-1987`
			`}`
			`break;`
			`}`
			`}`
			`}`
			`if (!charset)`
			`charset = get_default_charset();`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`if (!charset) {`
			`switch (get_encoding_code()) {`
			`case LATINC:`
			`case LATINC_UA:`
			`case LATINC_TJ:`
			`charset = detect_charset_cylillic(text); // fuzzy...`
			`break;`
			`case CHINESE_CN:`
			`case CHINESE_TW:`
			`case CHINESE_HK:`
			`charset = detect_charset_chinese(text);`
			`break;`
			`case JAPANESE:`
			`charset = detect_charset_japanese(text);`
			`break;`
			`case KOREAN:`
			`charset = detect_charset_korean(text);`
			`break;`
			`case VIETNAMESE:`
			`case THAI:`
			`case GEORGIAN:`
			`charset = get_encoding_items(get_encoding_code())->item[OPENI18N];`
			`break;`
			`default:`
			`if (strcmp(get_default_charset(), "UTF-8") != 0)`
			`charset = get_default_charset();`
			`else if (detect_noniso(text))`
			`charset = get_encoding_items(get_encoding_code())->item[CODEPAGE];`
			`else`
			`charset = get_encoding_items(get_encoding_code())->item[OPENI18N];`
			`if (!charset)`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00			`charset = get_encoding_items(get_encoding_code())->item[IANA];`
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`}`
			`}`
* l3afpad-0.8.18.1.3 2011-12-12 14:35:35 +01:00
* l3afpad-0.8.18.1.1 2011-12-12 14:29:30 +01:00			`return charset;`
			`}`