import java.io.*;
import org.textmining.text.extraction.WordExtractor;
* Title: pdf extraction
* Company: Matrix.org.cn
* @version 1.0,who use this example pls remain the declare
public class PdfExtractor {
public PdfExtractor() {
public static void main(String args[]) throws Exception
FileInputStream in = new FileInputStream ("c://a.doc");
WordExtractor extractor = new WordExtractor();
String str = extractor.extractText(in);
System.out.println("the result length is"+str.length());
System.out.println("the result is"+str);
- public class WordExtractor {
- public WordExtractor() {
- }
- public String extractText(InputStream in) throws IOException {
- ArrayList text = new ArrayList();
- POIFSFileSystem fsys = new POIFSFileSystem(in);
- DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
- DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
- byte[] header = new byte[headerProps.getSize()];
- din.read(header);
- din.close();
- // Prende le informazioni dall'header del documento
- int info = LittleEndian.getShort(header, 0xa);
- boolean useTable1 = (info & 0x200) != 0;
- //boolean useTable1 = true;
- // Prende informazioni dalla piece table
- int complexOffset = LittleEndian.getInt(header, 0x1a2);
- //int complexOffset = LittleEndian.getInt(header);
- String tableName = null;
- if (useTable1) {
- tableName = "1Table";
- } else {
- tableName = "0Table";
- }
- DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
- byte[] tableStream = new byte[table.getSize()];
- din = fsys.createDocumentInputStream(tableName);
- din.read(tableStream);
- din.close();
- din = null;
- fsys = null;
- table = null;
- headerProps = null;
- int multiple = findText(tableStream, complexOffset, text);
- StringBuffer sb = new StringBuffer();
- int size = text.size();
- tableStream = null;
- for (int x = 0; x < size; x++) {
- WordTextPiece nextPiece = (WordTextPiece) text.get(x);
- int start = nextPiece.getStart();
- int length = nextPiece.getLength();
- boolean unicode = nextPiece.usesUnicode();
- String toStr = null;
- if (unicode) {
- toStr = new String(header, start, length * multiple, "UTF-16LE");
- } else {
- toStr = new String(header, start, length, "ISO-8859-1");
- }
- sb.append(toStr).append(" ");
- }
- return sb.toString();
- }
- private static int findText(byte[] tableStream, int complexOffset, ArrayList text)
- throws IOException {
- //actual text
- int pos = complexOffset;
- int multiple = 2;
- //skips through the prms before we reach the piece table. These contain data
- //for actual fast saved files
- while (tableStream[pos] == 1) {
- pos++;
- int skip = LittleEndian.getShort(tableStream, pos);
- pos += 2 + skip;
- }
- if (tableStream[pos] != 2) {
- throw new IOException("corrupted Word file");
- } else {
- //parse out the text pieces
- int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
- pos += 4;
- int pieces = (pieceTableSize - 4) / 12;
- for (int x = 0; x < pieces; x++) {
- int filePos =
- LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x *"/images/forum/smiles/icon_cool.gif"/> + 2);
- boolean unicode = false;
- if ((filePos & 0x40000000) == 0) {
- unicode = true;
- } else {
- unicode = false;
- multiple = 1;
- filePos &= ~(0x40000000); //gives me FC in doc stream
- filePos /= 2;
- }
- int totLength =
- LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
- - LittleEndian.getInt(tableStream, pos + (x * 4));
- WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
- text.add(piece);
- }
- }
- return multiple;
- }
- public static void main(String[] args){
- WordExtractor w = new WordExtractor();
- POIFSFileSystem ps = new POIFSFileSystem();
- try{
- File file = new File("C://test.doc");
- InputStream in = new FileInputStream(file);
- String s = w.extractText(in);
- System.out.println(s);
- }catch(Exception e){
- e.printStackTrace();
- }
- }
- }
/**
 * Immutable description of one piece of document text: where it starts,
 * how long it is, and whether it is stored as Unicode.
 */
class WordTextPiece {
    private final int _fcStart;
    private final boolean _usesUnicode;
    private final int _length;

    /**
     * @param start   start position of the piece
     * @param length  length of the piece
     * @param unicode {@code true} if the piece is stored as Unicode
     */
    public WordTextPiece(int start, int length, boolean unicode) {
        _usesUnicode = unicode;
        _length = length;
        _fcStart = start;
    }

    /** @return {@code true} if the piece is stored as Unicode */
    public boolean usesUnicode() {
        return _usesUnicode;
    }

    /** @return the start position given at construction */
    public int getStart() {
        return _fcStart;
    }

    /** @return the length given at construction */
    public int getLength() {
        return _length;
    }
}
write word
- public boolean writeWordFile(String path, String content) {
- boolean w = false;
- try {
- // byte b[] = content.getBytes("ISO-8859-1");
- byte b[] = content.getBytes();
- ByteArrayInputStream bais = new ByteArrayInputStream(b);
- POIFSFileSystem fs = new POIFSFileSystem();
- DirectoryEntry directory = fs.getRoot();
- DocumentEntry de = directory.createDocument("WordDocument", bais);
- FileOutputStream ostream = new FileOutputStream(path);
- fs.writeFilesystem(ostream);
- bais.close();
- ostream.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return w;
- }
下载地址: http://www.apache.org/dist/jakarta/Poi/
<%@ page contentType="text/html; charset=GBK" import="java.io.*,org.apache.poi.hwpf.HWPFDocument,org.apache.poi.hwpf.usermodel.*,org.apache.poi.hwpf.model.*" %>
<!--r /> HWPFDocument doc = new HWPFDocument(new FileInputStream("g://a.doc"));<br />
<!--r /> if (text.trim ().length () == 0)<br />
java2word 是一个在java程序中调用 MS Office Word 文档的组件(类库)。该组件提供了一组简单的接口,以便java程序调用他的服务操作Word 文档。
更多激动人心的功能见详细说明: http://www.heavenlake.com/java2word/doc
3. 用java生成word文档
- java
后来发现java支持rtf格式的文档, word也支持, 于是乎便使用此产生word文档了. 呵呵..
java支持的rtf文档功能不是很强大, 我们可以借助于一些开源的库, 比如: itext就可以很好的支持. itext上有很多例子, 有兴趣的可以上去看一下, 这里就不摘录了.
但是itext比较大要1.4M, 不是很喜欢. 在sf上找来找去, 发现一个更小的库, 尽管功能不是很强大, 基本的功能都有, 他就是srw(Simple RTF Writer目前它的版本是0.6,好久都没有人维护了).
srw内置了很多例子, 例如: 我们要写一个简单的rtf, 我们只需要这么写:
// Minimal srw (Simple RTF Writer) example: builds an RTFDocument with one paragraph.
// NOTE(review): this listing looks truncated — the closing braces, the catch body
// and any call that adds the paragraph or saves the document are not present here.
public class TestSimpleRtf {
// Name of the output file; a relative path, so it resolves against the working directory.
private static final String FILE_NAME = "out_testsimplertf.rtf";
public static void main(String[] args) {
try {
// Create the RTF document (in landscape format)
RTFDocument doc = new RTFDocument(PageSize.DIN_A4_QUER);
// Define the display zoom and the view
doc.setViewscale(RTFDocument.VIEWSCALE_FULLPAGE); // set the display zoom to "full page"
doc.setViewkind(RTFDocument.VIEWKIND_PAGELAYOUT); // switch the view mode to page layout
Paragraph absatz = new Paragraph(18, 0, 16, Font.ARIAL, new TextPart("Simple RTF Writer Testdokument"));
File savefile = new File(FILE_NAME);
// Report the absolute path of the target file.
System.out.println("Neues RTF Dokument erstellt: " + savefile.getAbsolutePath());
} catch (IOException e) {
用法很简单, 但是功能很少, 比如没有table的功能, 不能设置打印方向等问题. 不过这个基本上就够用了.
后来, 我们的项目要求横向打印, 这可难坏了. 没办法, 自己查找word的rtf格式库, 拓展横向打印功能, 目前已经完成...
import com.itseasy.rtf.RTFDocument;
import com.itseasy.rtf.text.PageSize;
public class MyRTFDocument extends RTFDocument {
public static final int ORIENTATION_PORTRAIT = 0;
public static final int ORIENTATION_LANDSCAPE = 1;
private int orientation;
public MyRTFDocument() {
* @param arg0
public MyRTFDocument(PageSize arg0) {
/* (non-Javadoc)
* @see com.itseasy.rtf.RTFDocument#getDocumentAsString()
protected String getDocumentAsString() {
StringBuffer sb = new StringBuffer(super.getDocumentAsString());
int pos = -1;
if (ORIENTATION_LANDSCAPE == orientation) {
pos = sb.indexOf("paperw");
if (pos > 0) {
sb.insert(pos, "lndscpsxn");
pos = 0;
while((pos = sb.indexOf("pardplain", pos)) > 0){
pos = sb.indexOf("{", pos);
sb.insert(pos, "dbchaf2");
return sb.toString();
* @return Returns the orientation.
public int getOrientation() {
return orientation;
* @param orientation The orientation to set.
public void setOrientation(int orientation) {
this.orientation = orientation;
* WordBridge.java
package com.kela.util;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.PreparedStatement;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
import com.kela.db.PoolingDataSource;
// [Javadoc remnant below; its comment delimiters were lost] Description: operations on Word.
// NOTE(review): throughout this class the listing has lost brace-only lines, Javadoc
// delimiters and some statements; the review notes below mark the visible gaps.
* 说明: 对word的操作 <p>
* @author kela.kf@gmail.com
public class WordBridge {
// NOTE(review): the logger name "WordBridgt" differs from the class name — likely a typo; confirm.
Log log = LogFactory.getLog("WordBridgt");
// COM handle of the Word application; null until openWord creates it.
private ActiveXComponent MsWordApp = null;
// Dispatch handle of the current document; null when none is open.
private Dispatch document = null;
// [Javadoc remnant] Opens Word.
// @param makeVisible true shows Word, false keeps it hidden
* 打开word
* @param makeVisible, true显示word, false不显示word
public void openWord(boolean makeVisible) {
// Create the "Word.Application" COM object only if it does not exist yet.
if (MsWordApp == null) {
MsWordApp = new ActiveXComponent("Word.Application");
// Apply the requested visibility to the application.
Dispatch.put(MsWordApp, "Visible", new Variant(makeVisible));
// [Javadoc remnant] Creates a new document.
* 创建新的文档
public void createNewDocument() {
// Fetch the Documents collection and call Add; the result becomes the current document.
Dispatch documents = Dispatch.get(MsWordApp, "Documents").toDispatch();
document = Dispatch.call(documents, "Add").toDispatch();
// [Javadoc remnant] Closes the document.
* 关闭文档
public void closeDocument() {
// 0 = wdDoNotSaveChanges
// -1 = wdSaveChanges
// -2 = wdPromptToSaveChanges
Dispatch.call(document, "Close", new Variant(0));
document = null;
// [Javadoc remnant] Closes Word.
* 关闭word
public void closeWord() {
// Quit the application and drop both handles.
Dispatch.call(MsWordApp, "Quit");
MsWordApp = null;
document = null;
// [Javadoc remnant] Inserts text.
// @param textToInsert the text content
* 插入文本
* @param textToInsert 文本内容
public void insertText(String textToInsert) {
// Write the text through the application's Selection object.
Dispatch selection = Dispatch.get(MsWordApp, "Selection").toDispatch();
Dispatch.put(selection, "Text", textToInsert);
// [Javadoc remnant] Saves the file.
* 保存文件
* @param filename
public void saveFileAs(String filename) {
Dispatch.call(document, "SaveAs", filename);
// [Javadoc remnant] Converts the Word document to HTML.
* 将word转换成html
* @param htmlFilePath
public void wordToHtml(String htmlFilePath) {
// SaveAs with format code 8 — presumably Word's HTML format constant; confirm.
Dispatch.invoke(document,"SaveAs", Dispatch.Method, new Object[]{htmlFilePath,new Variant(8)}, new int[1]);
// [Javadoc remnant] Saves the Word file and, at the same time, an HTML copy.
// @param text the content to save
// @param wordFilePath path of the Word file
// @param htmlFilePath path of the HTML file
* 保存word的同时,保存一个html
* @param text 需要保存的内容
* @param wordFilePath word的路径
* @param htmlFilePath html的路径
* @throws LTOAException
public void wordAsDbOrToHtml(String text, String wordFilePath, String htmlFilePath) throws LTOAException {
// NOTE(review): the try body and the finally body are empty in this listing — statements appear to be missing.
try {
} catch (Exception ex) {
// Log messages: "error - an error occurred while operating on Word" / "cause - ...".
log.error("错误 - 对word的操作发生错误");
log.error("原因 - " + ex.getMessage());
// Rethrow as LTOAException, keeping the original exception as the cause.
throw new LTOAException(LTOAException.ERR_UNKNOWN, "对word的操作发生错误("
+ this.getClass().getName() + ".wordAsDbOrToHtml())", ex);
} finally {
// [Javadoc remnant] Saves the Word file into the database.
* 将word保存至数据库
* @param wordFilePath
* @param RecordID
* @throws LTOAException
public void wordAsDatabase(String wordFilePath, String RecordID) throws LTOAException {
Connection conn = null;
PreparedStatement pstmt = null;
PoolingDataSource pool = null;
File file = null;
String sql = "";
try {
sql = " UPDATE Document_File SET FileBody = ? WHERE RecordID = ? ";
pool = new PoolingDataSource();
conn = pool.getConnection();
file = new File(wordFilePath);
InputStream is = new FileInputStream(file);
// NOTE(review): the buffer is sized from available() but no read into it is visible here,
// so as listed it would bind zero bytes; no executeUpdate or close calls are visible either.
byte[] blobByte = new byte[is.available()];
pstmt = conn.prepareStatement(sql);
pstmt.setBinaryStream(1,(new ByteArrayInputStream(blobByte)), blobByte.length);
pstmt.setString(2, RecordID);
} catch (Exception ex) {
// Log messages: "error - unexpected error updating table Document_File" / "cause - ...".
log.error("错误 - 表 Document_File 更新数据发生意外错误");
log.error("原因 - " + ex.getMessage());
// NOTE(review): the message literal that should precede the "+" on the next line is missing from this listing.
throw new LTOAException(LTOAException.ERR_UNKNOWN,
+ this.getClass().getName() + ".wordAsDatabase())", ex);
} finally {
// [Javadoc remnant] Gets a unique identifier.
// @return the identifier
* 得到一个唯一的编号
* @return 编号
public String getRecordID() {
// The identifier is the current time in milliseconds, rendered as a decimal string.
String sRecordID = "";
java.util.Date dt=new java.util.Date();
long lg=dt.getTime();
Long ld=new Long(lg);
sRecordID =ld.toString();
return sRecordID;
// [Javadoc remnant] Gets the path needed for saving the Word or HTML file.
// @param systemType module type: govInfo, sw, fw (the remnant says "givInfo", but the code compares against "govInfo")
// @param fileType file type: doc, html (the visible checks compare against "htm")
// @param recID file identifier
// @return the path
* 得到保存word和html需要的路径
* @param systemType 模块类型 givInfo, sw, fw
* @param fileType 文件类型 doc, html
* @param recID 文件编号
* @return 路径
public String getWordFilePath(String systemType, String fileType, String recID) {
String filePath = "";
// Start from the classpath root and cut the last 15 characters off its path —
// presumably to strip a "WEB-INF/classes" suffix; confirm against the deployment layout.
File file = new File(this.getClass().getResource("/").getPath());
filePath = file.getPath().substring(0, file.getPath().length() - 15);
// NOTE(review): in each branch below an "else if" directly follows a plain assignment; the
// preceding "if" line (presumably testing fileType for "doc") appears to be missing — confirm.
if(systemType.equalsIgnoreCase("govInfo")) {
filePath = filePath + "/uploadFiles/govInfo/document/" + recID + ".doc";
else if(fileType.equalsIgnoreCase("htm"))
filePath = filePath + "/HTML/govInfo/" + recID + ".htm";
} else if(systemType.equalsIgnoreCase("sw")){
filePath = filePath + "/uploadFiles/sw/document/" + recID + ".doc";
else if(fileType.equalsIgnoreCase("htm"))
filePath = filePath + "/HTML/sw/" + recID + ".htm";
} else if(systemType.equalsIgnoreCase("fw")) {
filePath = filePath + "/uploadFiles/fw/document/" + recID + ".doc";
else if(fileType.equalsIgnoreCase("htm"))
filePath = filePath + "/HTML/fw/" + recID + ".htm";
return filePath;
5. 另一个例子(用jacob包):
jacob.dll是和com 交互的东西,我们需要把它放入windows/system32中,而且在path中要指明它的位置。
import com.jacob.com.*;
import com.jacob.activeX.*;
// Example: drives Word through JACOB to run a find-and-replace on a document and save it.
// NOTE(review): brace-only lines and the catch body are missing from this listing.
public class ReplaceWord {
public static void main(String[] args) {
ActiveXComponent app = new ActiveXComponent("Word.Application"); // start Word
String inFile = "C://test.doc"; // the Word file to run the replacement on
boolean flag = false;
try {
app.setProperty("Visible", new Variant(false)); // keep Word invisible
Object docs = app.getProperty("Documents").toDispatch();
Object doc = Dispatch.invoke(docs, "Open", Dispatch.Method, new Object[]{inFile, new Variant(false), new Variant(false)}, new int[1]).toDispatch(); // open the Word file; note that the third argument must be false: it says whether to open read-only, and since we want to save the original file it is opened writable.
Object content = Dispatch.get(doc, "Content").toDispatch(); // get the document's content object
Object finder = Dispatch.get(content, "Find").toDispatch(); // get the Find object, i.e. the one that performs find-and-replace
Variant f = new Variant(false);
boolean rt = true;
rt = Dispatch.invoke(finder, "Execute", Dispatch.Method, new Object[] {"New", f, f, f, f, f, f, f, f, "Old", new Variant(true)}, new int[1]).toBoolean(); // replace Old ---> New
Dispatch.call(doc, "Save"); // save
Dispatch.call(doc, "Close", f);
flag = true;
System.out.println("is over");
catch (Exception e){
finally {
// Always quit Word so no background instance is left running.
app.invoke("Quit", new Variant[] {});
