My Java Note [我的爪哇筆記]: 2月 2009

2009年2月18日星期三

JAVA 寫UTF-8文字檔關於BOM議題...

如果各位使用JAVA IO的OutputStreamWriter並指定編碼為UTF-8來寫檔，當寫出的內容只有ASCII的英文字、數字與符號（不包含中文字）時，用Windows的記事本開啟後選擇「另存新檔」，會看到其編碼為「ANSI」；但若檔案內包含中文字（或任何非ASCII文字），則會發現記事本「另存新檔」所顯示的編碼為「UTF-8」。
我們不是明明指定了UTF-8嗎？一切都是BOM的問題：OutputStreamWriter不會自動寫出BOM，記事本在沒有BOM的情況下只能依檔案內容猜測編碼，而純ASCII的內容就會被判定為「ANSI」。這又是另一段故事了，欲知詳情請上Google....

這裡單純說明如何在上述兩種情況下，都能一致地讓記事本在「另存新檔」時顯示編碼為「UTF-8」，下列兩個CLASS供各位參考：


import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import sun.nio.cs.StreamDecoder;

/**
 * A reader that inspects the first bytes of a stream for a Unicode byte-order
 * mark (BOM), selects the matching charset, and strips the BOM from the data.
 *
 * <p>Recognised marks: UTF-32BE, UTF-32LE, UTF-8, UTF-16BE and UTF-16LE. When no
 * mark is present the caller-supplied default encoding is used and no bytes are
 * dropped. Decoding is delegated to a private {@link InputStreamReader} wrapped
 * around a {@link PushbackInputStream}, so only public JDK API is used.
 */
public class UnicodeInputStreamReader extends InputStreamReader {

 /** Length of the longest recognised BOM (UTF-32); also the push-back capacity. */
 private static final int BOM_SIZE = 4;
 /** Reader that performs the actual decoding, positioned just after the BOM. */
 private final InputStreamReader decoder;
 private final PushbackInputStream pushBack;
 /** Name of the charset in effect; set while sniffing the BOM. */
 private String encode;
 private final String defaultEnc;

 /**
  * Creates a reader over {@code input}, choosing the charset from its BOM.
  *
  * @param input      the byte stream to decode
  * @param defaultEnc charset name used when the stream has no BOM; {@code null}
  *                   selects the platform default charset
  * @throws UnsupportedEncodingException if the selected charset is not supported
  */
 public UnicodeInputStreamReader(InputStream input, String defaultEnc) throws UnsupportedEncodingException {

  // InputStreamReader has no no-arg constructor; all reads below are routed
  // to the delegate, so the superclass decoder is never consulted.
  super(input);

  this.defaultEnc = defaultEnc;
  this.pushBack   = new PushbackInputStream(input, BOM_SIZE);

  try {
   init();
  } catch (IOException e) {
   // The signature cannot carry an IOException. init() has already pushed back
   // whatever it managed to read, so fall back to the default encoding here.
   this.encode = defaultEnc;
  }

  // A null name means "platform default", resolved here so that
  // getEncoding() never returns null.
  if (this.encode == null) {
   this.encode = java.nio.charset.Charset.defaultCharset().name();
  }

  this.decoder = new InputStreamReader(this.pushBack, this.encode);
 }

 /**
  * Reads up to {@link #BOM_SIZE} bytes, determines the encoding, and pushes
  * back every byte that is not part of a BOM.
  */
 private void init() throws IOException {

  byte[] bom = new byte[BOM_SIZE];
  int n = 0;     // bytes actually read
  int skip = 0;  // bytes that belong to the BOM and must be discarded

  try {
   // A single read() may return fewer bytes than requested, so keep reading
   // until the buffer is full or the stream ends.
   while (n < BOM_SIZE) {
    int r = this.pushBack.read(bom, n, BOM_SIZE - n);
    if (r <= 0) {
     break;
    }
    n += r;
   }
   skip = detect(bom, n);
  } finally {
   // Return everything after the BOM (or everything, if sniffing failed).
   if (n > skip) {
    this.pushBack.unread(bom, skip, n - skip);
   }
  }
 }

 /**
  * Compares the header bytes with the known BOMs and records the encoding.
  * Only the first {@code n} bytes of {@code bom} are considered valid, so a
  * stream shorter than a signature can never match it.
  *
  * @return the length of the matched BOM, or 0 when none matched
  */
 private int detect(byte[] bom, int n) {
  // The 4-byte signatures are tested first because FF FE is a prefix of FF FE 00 00.
  if ( (n >= 4) && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF) ) {
   this.encode = "UTF-32BE";
   return 4;
  }
  if ( (n >= 4) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00) ) {
   this.encode = "UTF-32LE";
   return 4;
  }
  if ( (n >= 3) && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF) ) {
   this.encode = "UTF-8";
   return 3;
  }
  if ( (n >= 2) && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF) ) {
   this.encode = "UTF-16BE";
   return 2;
  }
  if ( (n >= 2) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) ) {
   this.encode = "UTF-16LE";
   return 2;
  }
  // No BOM found: keep the caller's default and discard nothing.
  this.encode = this.defaultEnc;
  return 0;
 }

 /**
  * Returns the charset name chosen from the BOM, or the default encoding when
  * the stream had no BOM.
  */
 @Override
 public String getEncoding() {
  return this.encode;
 }

 @Override
 public int read() throws IOException {
  return this.decoder.read();
 }

 @Override
 public int read(char cbuf[], int offset, int length) throws IOException {
  return this.decoder.read(cbuf, offset, length);
 }

 @Override
 public boolean ready() throws IOException {
  return this.decoder.ready();
 }

 /** Closes the delegate reader, which in turn closes the wrapped streams. */
 @Override
 public void close() throws IOException {
  this.decoder.close();
 }
}


import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;

/**
 * An {@link OutputStreamWriter} fixed to UTF-8 that writes the UTF-8 byte-order
 * mark (EF BB BF) to the underlying stream at construction time.
 *
 * <p>The mark is written unconditionally; the class does not check whether the
 * target already contains data.
 */
public class UTF8OutputStreamWriter extends OutputStreamWriter {

 /**
  * Wraps {@code pos} in a UTF-8 writer and immediately emits the BOM.
  *
  * @param pos the stream that receives the BOM and all encoded text
  * @throws IOException if writing the BOM to {@code pos} fails
  */
 public UTF8OutputStreamWriter(OutputStream pos) throws IOException {
  super(pos, "UTF-8");
  emitBom(pos);
 }

 /**
  * Same as {@link #UTF8OutputStreamWriter(OutputStream)}.
  *
  * @param pos       the stream that receives the BOM and all encoded text
  * @param pencoding ignored; the writer always encodes as UTF-8
  * @throws IOException if writing the BOM to {@code pos} fails
  */
 public UTF8OutputStreamWriter(OutputStream pos, String pencoding) throws IOException {
  this(pos);
 }

 /** Sends the three BOM bytes straight to the raw stream in a single write. */
 private static void emitBom(OutputStream target) throws IOException {
  byte[] marker = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
  target.write(marker);
 }
}

部落客廣告聯播

2009年2月18日 星期三

JAVA 寫UTF-8文字檔關於BOM議題...

2009年2月18日星期三