1.需要引用的jar
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;
2.主要方法
public static void getPdfText(InputStream inputStream) throws Exception {
// 是否排序
boolean sort = false;
// 编码方式
String encoding = "UTF-8";
// 开始提取页数
int startPage = 1;
// 内存中存储的PDF Document
PDDocument pdDocument = null;
StringBuffer sb = new StringBuffer();
try {
//获取图片
List<PDPage> pages = pdDocument.getDocumentCatalog().getAllPages();
Iterator<PDPage> iter = pages.iterator();
while (iter.hasNext()) {
PDPage page = (PDPage) iter.next();
PDResources resources = page.getResources();
Map<String, PDXObjectImage> images = resources.getImages();
if (images != null) {
Iterator<String> imageIter = images.keySet().iterator();
while (imageIter.hasNext()) {
String key = (String) imageIter.next();
PDXObjectImage image = (PDXObjectImage) images.get(key);
if (!ObjectUtils.isEmpty(image.getRGBImage())) {
int width = image.getRGBImage().getWidth();
int height = image.getRGBImage().getHeight();
imageInputStream.close();
imageInputStream = null;
imageTextVo = null;
}
}
}
}
}
// 获取页码
int endPage = pdDocument.getNumberOfPages();
PDFTextStripper stripper = null;
stripper = new PDFTextStripper();
// 设置是否排序
stripper.setSortByPosition(sort);
// 设置起始页
stripper.setStartPage(startPage);
// 设置结束页
stripper.setEndPage(endPage);
sb.append(stripper.getText(pdDocument));
vo.setText(sb.toString());
vo.setTotalPage(((endPage + 1) - startPage)); //最后一页+1,然后减去开始页
} catch (MalformedURLException e) {
e.printStackTrace();
} finally {
if (inputStream != null) {
// 关闭输出流
inputStream.close();
}
if (pdDocument != null) {
// 关闭PDF Document
pdDocument.close();
}
}
}
版权声明:本文为qq_23140197原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。