Download
Getting Started
Members
Projects
Community
Marketplace
Events
Planet Eclipse
Newsletter
Videos
Participate
Report a Bug
Forums
Mailing Lists
Wiki
IRC
How to Contribute
Working Groups
Automotive
Internet of Things
LocationTech
Long-Term Support
PolarSys
Science
OpenMDM
More
Community
Marketplace
Events
Planet Eclipse
Newsletter
Videos
Participate
Report a Bug
Forums
Mailing Lists
Wiki
IRC
How to Contribute
Working Groups
Automotive
Internet of Things
LocationTech
Long-Term Support
PolarSys
Science
OpenMDM
Toggle navigation
Bugzilla – Attachment 166430 Details for
Bug 237567
[document] html editor mishandles encoding - it may destroy file
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
Terms of Use
|
Copyright Agent
Some Eclipse Foundation services are deprecated, or will be soon. Please ensure you've read
this important communication.
sample code (EncodingGuesser.java)
EncodingGuesser.java (text/plain), 8.54 KB, created by
Toshihiro Izumi
on 2010-04-29 02:20:07 EDT
(
hide
)
Description:
sample code (EncodingGuesser.java)
Filename:
MIME Type:
Creator:
Toshihiro Izumi
Created:
2010-04-29 02:20:07 EDT
Size:
8.54 KB
patch
obsolete
>/******************************************************************************* > * Copyright (c) 2001, 2005 IBM Corporation and others. > * All rights reserved. This program and the accompanying materials > * are made available under the terms of the Eclipse Public License v1.0 > * which accompanies this distribution, and is available at > * http://www.eclipse.org/legal/epl-v10.html > * > * Contributors: > * IBM Corporation - initial API and implementation > * Jens Lukowski/Innoopract - initial renaming/restructuring > * > *******************************************************************************/ >package org.eclipse.wst.html.core.internal.contenttype; > >/** > * > * This is ported from PageDesigner's hpbcom/Kanji.cpp's > * Kanji::guess_kanji_code(), > * > */ >public class EncodingGuesser { > private static final int ASCII = 0; // ASCII > // ISO-2022-JP > private static final int ASCII_IN = 8; // This is after ISO2022's change > // Shift-JIS > private static final int EUC_HALFKANA = 6; // This is Half Kana in EUC-JP > private static final int EUC_JP = 3; // This is EUC-JP > private static final int ISO2022_JP = 4; // This is ISO-2022-JP > private static final int JIS_HALFKANA = 7; // THis is Half Kana in > private static final byte KT_EUC1 = 0x40; > private static final byte KT_EUC2 = (byte) 0x80; > // ASCII > private static final byte KT_JIN = 0x01; > private static final byte KT_JOUT = 0x02; > // private static final byte KT_ESC = 0x04; > // private static final byte KT_JIS = 0x08; > private static final byte KT_SFT1 = 0x10; > private static final byte KT_SFT2 = 0x20; > private static final byte ktype[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00 */ > 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10 */ > 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x08, 0x08, 0x09, 0x08, 0x08, 0x08, /* !"#$%&' *//* " */ > 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* ()*+,-./ */ > 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* 01234567 */ > 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* 89:; <=>? */ > 0x29, 0x28, 0x2b, 0x28, 0x28, 0x28, 0x28, 0x28, /* @ABCDEFG */ > 0x2a, 0x28, 0x2a, 0x28, 0x28, 0x28, 0x28, 0x28, /* HIJKLMNO */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* PQRSTUVW */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* XYZ[\]^_ */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* abcdefg */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* hijklmno */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* pqrstuvw */ > 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x00, /* xyz{|}~ */ > 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* 80 */ > 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* 90 */ > 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x20, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* A0 */ > (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* B0 */ > (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* C0 */ > (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* D0 */ > (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, /* E0 */ > (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, /* F0 */ > (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xc0, (byte) 0xc0, 0x00,}; > // private static final int ISO8859_1 = 1; // ISO-1 > private static final int SHIFT_JIS = 2; // This is Shift-JIS > private static final int SJIS_HALFKANA = 5; // This is Half Kana in > > private static final int UTF_8 = 9; > > /** > * Currently, only Japanese encodings are supported. > */ > private static final int UNKNOWN = -1; // Unknown > > /** > * @return java.lang.String > * @param code > * int > * > * Convert private int to IANA Encoding name. > */ > private static String convertToIANAEncodingName(int code) { > String encoding = null; > > switch (code) { > case UTF_8: > encoding = "UTF-8";//$NON-NLS-1$ > break; > case SHIFT_JIS : > case SJIS_HALFKANA : > encoding = "Shift_JIS";//$NON-NLS-1$ > break; > case EUC_JP : > case EUC_HALFKANA : > encoding = "EUC-JP";//$NON-NLS-1$ > break; > case ISO2022_JP : > case JIS_HALFKANA : > encoding = "ISO-2022-JP";//$NON-NLS-1$ > default : > break; > } > > return encoding; > } > > /** > * Return guessed Java Encoding name target: bytes to be inspected length: > * length of target > */ > public static String guessEncoding(byte[] target, int length) { > int code = UNKNOWN; > > // Currently, only Japanese is supported. > String system_ctype = java.util.Locale.getDefault().getLanguage(); > String jp_ctype = java.util.Locale.JAPANESE.getLanguage(); > if (system_ctype.compareTo(jp_ctype) == 0) { > // Ok, I'm under ja_JP. > code = ASCII; > int pos = 0; > while ((code == ASCII) && (length > 0)) { > int ch1 = target[pos]; > ch1 = ch1 & 0x000000FF; > int ch2 = (length >= 2) ? target[pos + 1] : 0; > ch2 = ch2 & 0x000000FF; > int ch3 = (length >= 3) ? target[pos + 2] : 0; > ch3 = ch3 & 0x000000FF; > int ch4 = (length >= 4) ? target[pos + 3] : 0; > ch4 = ch4 & 0x000000FF; > code = guessJapaneseKanjiCode(ch1, ch2, ch3, ch4, 0); > pos++; > length--; > } > switch (code) { > case UTF_8: > code = UTF_8; > break; > case ISO2022_JP : > case JIS_HALFKANA : > code = ISO2022_JP; > break; > case EUC_JP : > code = EUC_JP; > break; > default : > code = SHIFT_JIS; > } > } > return (convertToIANAEncodingName(code)); > } > > /** > * Guess the encoding. halfkana_flag = 0x01 ( detect SJIS half kana ) > * halfkana_flag = 0x02 ( detect EUC half kana ) > */ > private static int guessJapaneseKanjiCode(int ch1, int ch2, int ch3, int ch4, int halfkana_flag) { > boolean sjis_hankaku_flag = ((halfkana_flag & 0x01) != 0) ? true : false; > boolean euc_hankaku_flag = ((halfkana_flag & 0x02) != 0) ? true : false; > > if (ch1 == 0) > return UNKNOWN; > //XXX guessing UTF-8 > // reference: http://ja.wikipedia.org/wiki/UTF-8 > if (inRange(ch1, 0xf0, 0xf7) && inRange(ch2, 0x80, 0xbf) > && inRange(ch3, 0x80, 0xbf) && inRange(ch4, 0x80, 0xbf)) { > // (f0-f7)(80-bf)(80-bf)(80-bf) > // Shift JIS kanji 1st byte : 0x81-0x9F, 0xE0-0xFC > // Shift JIS kanji 2nd byte : 0x40-0x7E, 0x80-0xFC > // if ch is in overlapped area, it may be either utf-8 or sjis > return UTF_8; > } else if (inRange(ch1, 0xe0, 0xef) && inRange(ch2, 0x80, 0xbf) > && inRange(ch3, 0x80, 0xbf)) { > // (e0-ef)(80-bf)(80-bf) > // Shift JIS kanji 1st byte : 0x81-0x9F, 0xE0-0xFC > // Shift JIS kanji 2nd byte : 0x40-0x7E, 0x80-0xFC > // if ch is in overlapped area, it may be either utf-8 or sjis > return UTF_8; > } else if (inRange(ch1, 0xc0, 0xdf) && inRange(ch2, 0x80, 0xbf)) { > // (c0-df)(80-bf) > // Shift JIS half kana : 0xA1-0xDF > // if ch is in overlapped area, it may be either utf-8 or sjis > return UTF_8; > } > // end guessing UTF-8 > if (sjis_hankaku_flag && ch1 >= 0xa1 && ch1 <= 0xdf) > return SJIS_HALFKANA; > else if (euc_hankaku_flag && ch1 == 0x8e && ch2 >= 0xa1 && ch2 <= 0xdf) > return EUC_HALFKANA; > else if (((ktype[ch1] & KT_SFT1) != 0) && ((ktype[ch2] & KT_SFT2) != 0)) > return SHIFT_JIS; > else if (((ktype[ch1] & KT_EUC1) != 0) && ((ktype[ch2] & KT_EUC2) != 0)) > return EUC_JP; > else if (ch1 == 0x1b && ((ktype[ch2] & KT_JIN) != 0)) > return ISO2022_JP; > else if (ch1 >= 0xa1 && ch1 <= 0xdf) > return SJIS_HALFKANA; > else if (ch1 == 0x1b && ch2 == 0x28/* '(' */&& ch3 == 0x49/* 'I' */) > return JIS_HALFKANA; > else if (ch1 == 0x1b && ch2 == 0x28/* '(' */&& ((ktype[ch3] & KT_JOUT) != 0)) > return ASCII_IN; > > return ASCII; > } > > private static boolean inRange(int ch, int low, int high) { > return ch >= low && ch <= high; > } > > public EncodingGuesser() { > super(); > } >}
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 237567
:
105234
|
105235
|
105236
|
105237
|
105238
| 166430