|
/* * Copyright (c) 2002-2003 Che, Dong Email: chedongATbigfoot.com/chedongATchedong.com * $Id: HelloUnicode.java,v 1.3 2003/03/09 08:41:46 chedong Exp $ */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Purpose: demonstrate how different character encoding/decoding choices
 * affect the handling of multi-byte (Chinese) text.
 *
 * Input: the test string may be supplied as the first command line argument.
 *
 * Output:
 *   Test 1 - re-decodes the string in several ways, dumps every char, and
 *            writes each variant to a file.
 *   Test 2 - reads the strings back from those files, decodes them in
 *            several ways, and dumps every char again.
 *
 * @author Che, Dong
 */
class HelloUnicode {
    /**
     * Program entry point: runs test 1 (write) and then test 2 (read back).
     * Any exception aborts the remaining steps and is printed to standard
     * output.
     *
     * @param args command line arguments; args[0], when present, replaces
     *             the built-in test string
     */
    public static void main(String[] args) {
        String hello = "Hello world 世界你好";

        // read the test string from the command line when one is given
        if (args.length > 0) {
            hello = args[0];
        }

        try {
            /*
             * Test 1: take the test string as the platform handed it to us,
             * re-decode it in different ways and write every variant to a file.
             */
            System.out.println(">>>>testing1: write hello world to files<<<<");
            System.out.println("[test 1-1]: with system default encoding="
                + System.getProperty("file.encoding") + "\nstring=" + hello
                + "\tlength=" + hello.length());
            printCharArray(hello);
            writeFile("hello.orig.html", hello);

            // encode with the platform default charset, decode the bytes as GB2312
            hello = new String(hello.getBytes(), "GB2312");
            System.out.println(
                "[test 1-2]: getBytes with platform default encoding and decoding as gb2312:\nstring="
                + hello + "\tlength=" + hello.length());
            writeFile("hello.gb2312.html", hello);
            printCharArray(hello);

            // encode as UTF8, then deliberately decode those bytes with the
            // platform default charset so the mismatch becomes visible
            hello = new String(hello.getBytes("UTF8"));
            System.out.println("[test 1-3]: convert string to UTF8\nstring="
                + hello + "\tlength=" + hello.length());
            writeFile("hello.utf8.html", hello);
            printCharArray(hello);

            /*
             * Test 2: read the files produced by test 1 back in and decode
             * them in different ways.
             */
            System.out.println(
                ">>>>testing2: reading and decoding from files<<<<");

            // first file: read with the platform default charset
            hello = readFile("hello.orig.html");
            System.out.println(
                "[test 2-1]: read hello.orig.html: decoding with system default encoding\nstring="
                + hello + "\tlength=" + hello.length());
            printCharArray(hello);

            // second file: also read with the platform default charset; no
            // extra GB2312 decoding step is applied here
            hello = readFile("hello.gb2312.html");
            System.out.println(
                "[test 2-2]: read hello.gb2312.html: decoding as GB2312\nstring="
                + hello + "\tlength=" + hello.length());
            printCharArray(hello);

            // third file: re-encode with the platform default charset and
            // decode the resulting bytes as UTF8
            hello = readFile("hello.utf8.html");
            hello = new String(hello.getBytes(), "UTF8");
            System.out.println(
                "[test 2-3]: read hello.utf8.html: decoding as UTF8\nstring="
                + hello + "\tlength=" + hello.length());
            printCharArray(hello);
        } catch (Exception e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Prints one line per char of the input: the char itself, its low 8 bits
     * as a signed byte plus hex, its 16 bits as a signed short plus hex, and
     * the Unicode block it belongs to. A blank line follows the dump.
     *
     * @param inStr input string
     */
    public static void printCharArray(String inStr) {
        char[] chars = inStr.toCharArray();

        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];
            byte b = (byte) c;
            short s = (short) c;

            // Mask before converting: a negative byte/short is widened to int
            // with sign extension, which would otherwise print as FFFFFFxx.
            String hexB = Integer.toHexString(b & 0xFF).toUpperCase();
            String hexS = Integer.toHexString(s & 0xFFFF).toUpperCase();

            StringBuffer sb = new StringBuffer();

            // the char itself
            sb.append("char[").append(i).append("]='").append(c).append("'\t");

            // byte value
            sb.append("byte=").append(b).append(" \\u").append(hexB).append('\t');

            // short value
            sb.append("short=").append(s).append(" \\u").append(hexS).append('\t');

            // Unicode block
            sb.append(Character.UnicodeBlock.of(c));

            System.out.println(sb.toString());
        }

        System.out.println();
    }

    /**
     * Writes content to the named file using the platform default charset,
     * replacing any previous content. An I/O failure is reported on standard
     * output and otherwise ignored so the remaining tests can still run.
     *
     * @param fileName output file name
     * @param content file content to write
     */
    private static void writeFile(String fileName, String content) {
        try {
            // Opening in truncate mode replaces old content even when the
            // existing file could not have been deleted first.
            FileWriter writer = new FileWriter(fileName);
            try {
                writer.write(content);
            } finally {
                writer.close();
            }
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Reads the named file with the platform default charset and returns its
     * lines concatenated without line terminators.
     *
     * @param fileName input file name
     * @return String file content
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(String fileName) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(fileName));
        try {
            StringBuffer out = new StringBuffer();
            String line;

            while ((line = reader.readLine()) != null) {
                out.append(line);
            }

            return out.toString();
        } finally {
            // always release the file handle, even when reading fails
            reader.close();
        }
    }
}
运行结果:
C:\ja>java HelloUnicode >>>>testing1: write hello world to files<<<< [test 1-1]: with system default encoding=GBK string=Hello world 世界你好 length=16 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='世' byte=22 \u16 short=19990 \u4E16 CJK_UNIFIED_IDEOGRAPHS char[13]='界' byte=76 \u4C short=30028 \u754C CJK_UNIFIED_IDEOGRAPHS char[14]='你' byte=96 \u60 short=20320 \u4F60 CJK_UNIFIED_IDEOGRAPHS char[15]='好' byte=125 \u7D short=22909 \u597D CJK_UNIFIED_IDEOGRAPHS
[test 1-2]: getBytes with platform default encoding and decoding as gb2312: string=Hello world 世界你好 length=16 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='世' byte=22 \u16 short=19990 \u4E16 CJK_UNIFIED_IDEOGRAPHS char[13]='界' byte=76 \u4C short=30028 \u754C CJK_UNIFIED_IDEOGRAPHS char[14]='你' byte=96 \u60 short=20320 \u4F60 CJK_UNIFIED_IDEOGRAPHS char[15]='好' byte=125 \u7D short=22909 \u597D CJK_UNIFIED_IDEOGRAPHS
[test 1-3]: convert string to UTF8 string=Hello world 涓栫晫浣犲ソ length=18 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='涓' byte=-109 \uFFFFFF93 short=28051 \u6D93 CJK_UNIFIED_IDEOGRAPHS char[13]='栫' byte=43 \u2B short=26667 \u682B CJK_UNIFIED_IDEOGRAPHS char[14]='晫' byte=107 \u6B short=26219 \u666B CJK_UNIFIED_IDEOGRAPHS char[15]='浣' byte=99 \u63 short=28003 \u6D63 CJK_UNIFIED_IDEOGRAPHS char[16]='犲' byte=-78 \uFFFFFFB2 short=29362 \u72B2 CJK_UNIFIED_IDEOGRAPHS char[17]='ソ' byte=-67 \uFFFFFFBD short=12477 \u30BD KATAKANA
>>>>testing2: reading and decoding from files<<<< [test 2-1]: read hello.orig.html: decoding with system default encoding string=Hello world 世界你好 length=16 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='世' byte=22 \u16 short=19990 \u4E16 CJK_UNIFIED_IDEOGRAPHS char[13]='界' byte=76 \u4C short=30028 \u754C CJK_UNIFIED_IDEOGRAPHS char[14]='你' byte=96 \u60 short=20320 \u4F60 CJK_UNIFIED_IDEOGRAPHS char[15]='好' byte=125 \u7D short=22909 \u597D CJK_UNIFIED_IDEOGRAPHS
[test 2-2]: read hello.gb2312.html: decoding as GB2312 string=Hello world 世界你好 length=16 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='世' byte=22 \u16 short=19990 \u4E16 CJK_UNIFIED_IDEOGRAPHS char[13]='界' byte=76 \u4C short=30028 \u754C CJK_UNIFIED_IDEOGRAPHS char[14]='你' byte=96 \u60 short=20320 \u4F60 CJK_UNIFIED_IDEOGRAPHS char[15]='好' byte=125 \u7D short=22909 \u597D CJK_UNIFIED_IDEOGRAPHS
[test 2-3]: read hello.utf8.html: decoding as UTF8 string=Hello world 世界你好 length=16 char[0]='H' byte=72 \u48 short=72 \u48 BASIC_LATIN char[1]='e' byte=101 \u65 short=101 \u65 BASIC_LATIN char[2]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[3]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[4]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[5]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[6]='w' byte=119 \u77 short=119 \u77 BASIC_LATIN char[7]='o' byte=111 \u6F short=111 \u6F BASIC_LATIN char[8]='r' byte=114 \u72 short=114 \u72 BASIC_LATIN char[9]='l' byte=108 \u6C short=108 \u6C BASIC_LATIN char[10]='d' byte=100 \u64 short=100 \u64 BASIC_LATIN char[11]=' ' byte=32 \u20 short=32 \u20 BASIC_LATIN char[12]='世' byte=22 \u16 short=19990 \u4E16 CJK_UNIFIED_IDEOGRAPHS char[13]='界' byte=76 \u4C short=30028 \u754C CJK_UNIFIED_IDEOGRAPHS char[14]='你' byte=96 \u60 short=20320 \u4F60 CJK_UNIFIED_IDEOGRAPHS char[15]='好' byte=125 \u7D short=22909 \u597D CJK_UNIFIED_IDEOGRAPHS
|