java读取word-excel-ppt文件代码

程序员文章站 2023-11-25 10:29:04

word: import org.apache.lucene.document.document; import org.apache.lucene.document.fi...

word:
import org.apache.lucene.document.document;
import org.apache.lucene.document.field;
import org.apache.poi.hwpf.extractor.wordextractor;
import java.io.file;
import java.io.inputstream;
import java.io.fileinputstream;
import com.search.code.index;
public document getdocument(index index, string url, string title, inputstream is) throws doccenterexception {
string bodytext = null;
try {
wordextractor ex = new wordextractor(is);//is是word文件的inputstream
bodytext = ex.gettext();
if(!bodytext.equals("")){
index.addindex(url, title, bodytext);
}
}catch (doccenterexception e) {
throw new doccenterexception("无法从该mocriosoft word文档中提取内容", e);
}catch(exception e){
e.printstacktrace();
}
}
return null;
}
excel:
import org.apache.lucene.document.document;
import org.apache.lucene.document.field;
import org.apache.poi.hwpf.extractor.wordextractor;
import org.apache.poi.hssf.usermodel.hssfworkbook;
import org.apache.poi.hssf.usermodel.hssfsheet;
import org.apache.poi.hssf.usermodel.hssfrow;
import org.apache.poi.hssf.usermodel.hssfcell;
import java.io.file;
import java.io.inputstream;
import java.io.fileinputstream;
import com.search.code.index;
/**
 * Collects the string value of every cell in an Excel workbook and hands the combined text to
 * the search index.
 *
 * NOTE(review): identifiers in this listing appear to have been lowercased by the page it was
 * copied from (string, inputstream, hssfworkbook, ...); restore Java casing before compiling.
 * NOTE(review): getstringcellvalue is called on every cell regardless of its type; confirm how
 * the POI version in use treats numeric cells.
 *
 * @param index receives the collected text through addindex
 * @param url   passed to addindex unchanged
 * @param title passed to addindex unchanged
 * @param is    stream over the Excel file; this method does not close it
 * @return always null
 * @throws doccenterexception re-wrapped when one is raised while reading or indexing
 */
public document getdocument(index index, string url, string title, inputstream is) throws doccenterexception {
    stringbuffer content = new stringbuffer();
    try {
        hssfworkbook workbook = new hssfworkbook(is);
        for (int numsheets = 0; numsheets < workbook.getnumberofsheets(); numsheets++) {
            // Fetch each sheet/row/cell once and skip the missing ones.
            hssfsheet asheet = workbook.getsheetat(numsheets);
            if (asheet == null) {
                continue;
            }
            for (int rownumofsheet = 0; rownumofsheet <= asheet.getlastrownum(); rownumofsheet++) {
                hssfrow arow = asheet.getrow(rownumofsheet);
                if (arow == null) {
                    continue;
                }
                for (short cellnumofrow = 0; cellnumofrow <= arow.getlastcellnum(); cellnumofrow++) {
                    hssfcell acell = arow.getcell(cellnumofrow);
                    if (acell == null) {
                        continue;
                    }
                    // Separate cell values so words from neighbouring cells do not fuse
                    // into a single token in the indexed text.
                    content.append(acell.getstringcellvalue()).append(' ');
                }
            }
        }
        // The original compared the stringbuffer itself with "", which is never equal,
        // so the emptiness check could not work; test the collected text instead.
        string text = content.tostring().trim();
        if (text.length() > 0) {
            index.addindex(url, title, text);
        }
    } catch (doccenterexception e) {
        // The message previously named a Word document although this reads Excel files.
        throw new doccenterexception("无法从该microsoft excel文档中提取内容", e);
    } catch (exception e) {
        // Best effort: any other failure is reported and the workbook is skipped.
        system.out.println("已运行xlread() : " + e );
    }
    return null;
}
powerpoint:
import java.io.inputstream;
import org.apache.lucene.document.document;
import org.apache.poi.hslf.hslfslideshow;
import org.apache.poi.hslf.model.textrun;
import org.apache.poi.hslf.model.slide;
import org.apache.poi.hslf.usermodel.slideshow;
/**
 * Collects the text runs and title of every slide in a PowerPoint file and hands the combined
 * text to the search index.
 *
 * NOTE(review): identifiers in this listing appear to have been lowercased by the page it was
 * copied from (string, inputstream, slideshow, ...); restore Java casing before compiling.
 *
 * @param index receives the collected text through addindex
 * @param url   passed to addindex unchanged
 * @param title passed to addindex unchanged
 * @param is    stream over the PowerPoint file; this method does not close it
 * @return always null
 * @throws doccenterexception declared by the signature; the broad catch below handles every
 *         exception raised inside the try block
 */
public document getdocument(index index, string url, string title, inputstream is)
        throws doccenterexception {
    stringbuffer content = new stringbuffer("");
    try {
        slideshow ss = new slideshow(new hslfslideshow(is));
        slide[] slides = ss.getslides();
        for (int i = 0; i < slides.length; i++) {
            textrun[] t = slides[i].gettextruns();
            for (int j = 0; j < t.length; j++) {
                // Separate runs so their words do not fuse into one token in the indexed text.
                content.append(t[j].gettext()).append(' ');
            }
            // A slide without a title yields null; appending it directly would put the
            // literal text "null" into the indexed content.
            string slidetitle = slides[i].gettitle();
            if (slidetitle != null) {
                content.append(slidetitle).append(' ');
            }
        }
        index.addindex(url, title, content.tostring());
    } catch (exception ex) {
        // Best effort: failures are reported and the presentation is skipped.
        system.out.println(ex.tostring());
    }
    return null;
}
pdf:
import java.io.inputstream;
import java.io.ioexception;
import org.apache.lucene.document.document;
import org.pdfbox.cos.cosdocument;
import org.pdfbox.pdfparser.pdfparser;
import org.pdfbox.pdmodel.pddocument;
import org.pdfbox.pdmodel.pddocumentinformation;
import org.pdfbox.util.pdftextstripper;
import com.search.code.index;
/**
 * Extracts the text of a PDF file, prefers the title stored in the PDF metadata when one is
 * present, and hands the result to the search index.
 *
 * NOTE(review): identifiers in this listing appear to have been lowercased by the page it was
 * copied from (string, inputstream, pddocument, ...); restore Java casing before compiling.
 *
 * @param index receives the extracted text through addindex
 * @param url   passed to addindex unchanged
 * @param title fallback title, used when the PDF metadata carries no non-empty title
 * @param is    stream over the PDF file; this method does not close it
 * @return always null
 * @throws doccenterexception if the file cannot be parsed, is encrypted, or its text cannot
 *         be extracted
 */
public document getdocument(index index, string url, string title, inputstream is)throws doccenterexception {
    cosdocument cosdoc = null;
    try {
        cosdoc = parsedocument(is);
    } catch (ioexception e) {
        // cosdoc is still unassigned when parsing fails, so there is nothing to close here.
        throw new doccenterexception("无法处理该pdf文档", e);
    }
    if (cosdoc.isencrypted()) {
        closecosdocument(cosdoc);
        throw new doccenterexception("该pdf文档是加密文档，无法处理");
    }
    string doctext = null;
    pddocument pddoc = null;
    try {
        // One pddocument wrapper serves both text extraction and the metadata lookup.
        pddoc = new pddocument(cosdoc);
        pdftextstripper stripper = new pdftextstripper();
        doctext = stripper.gettext(pddoc);
        try {
            pddocumentinformation docinfo = pddoc.getdocumentinformation();
            string pdftitle = docinfo.gettitle();
            if (pdftitle != null && !pdftitle.equals("")) {
                title = pdftitle;
            }
        } catch (exception e) {
            // Metadata is optional: report the problem and keep the caller-supplied title.
            system.err.println("无法取得该pdf文档的元数据" + e.getmessage());
        }
    } catch (ioexception e) {
        throw new doccenterexception("无法处理该pdf文档", e);
    } finally {
        // Single release point for both handles, on success and on failure.
        closecosdocument(cosdoc);
        closepddocument(pddoc);
    }
    // The original computed the text and title but never passed them to the index.
    if (doctext != null && !doctext.equals("")) {
        index.addindex(url, title, doctext);
    }
    return null;
}
/**
 * Runs the PDF parser over the given stream and returns the document it produced.
 *
 * @param is stream over the PDF file
 * @return the document obtained from the parser after parsing
 * @throws ioexception propagated from the parser
 */
private static cosdocument parsedocument(inputstream is) throws ioexception {
    pdfparser pdfreader = new pdfparser(is);
    pdfreader.parse();
    cosdocument parsed = pdfreader.getdocument();
    return parsed;
}
/**
 * Closes the given document, ignoring a null argument and any I/O error raised by the close.
 *
 * @param cosdoc document to close; may be null
 */
private void closecosdocument(cosdocument cosdoc) {
    if (cosdoc == null) {
        return;
    }
    try {
        cosdoc.close();
    } catch (ioexception ignored) {
        // Best-effort cleanup: a failure to close is deliberately not reported.
    }
}
/**
 * Closes the given document, ignoring a null argument and any I/O error raised by the close.
 *
 * @param pddoc document to close; may be null
 */
private void closepddocument(pddocument pddoc) {
    if (pddoc == null) {
        return;
    }
    try {
        pddoc.close();
    } catch (ioexception ignored) {
        // Best-effort cleanup: a failure to close is deliberately not reported.
    }
}
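复制代码时可能出错：本页代码中的类名、方法名和包名都被转成了小写，使用前需要恢复 Java 的大小写（例如 String、InputStream、WordExtractor、HSSFWorkbook、PDDocument）。原始代码经过测试可以使用，所用版本为 POI 3.0-rc4、PDFBox 0.7.3。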

poi: http://jakarta.apache.org/poi/index.html
pdfbox:

上一篇： Android Activity之间传递图片(Bitmap)的方法

下一篇：学经济应该选财经类的大学还是985 211？附财经大学211院校推荐

java读取word-excel-ppt文件代码

Java获取网络文件并插入数据库的代码

ASP.NET中读取XML文件信息的4种方法与示例代码

Java IO读取文件的实例详解

java读取word-excel-ppt文件代码

java多线程复制文件的实例代码

java实现大文件分割与合并的实例代码

Java实现读取键盘输入保存到txt文件,再统计并输出每个单词出现次数的方法

java大文件上传插件（java课程设计题目及代码）

java大文件上传插件（java课程设计题目及代码）

asp读取远程文件并保存到本地代码