Java解析PDF获取文本和图片信息

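1.需要引用的jar及对应的import(以下类路径属于 Apache PDFBox 1.8.x;2.x 版本中 PDFTextStripper 已移至 org.apache.pdfbox.text 包,PDXObjectImage 已被移除)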

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;

2.主要方法

 public static void getPdfText(InputStream inputStream) throws Exception {
   // 是否排序
        boolean sort = false;
        // 编码方式
        String encoding = "UTF-8";
        // 开始提取页数
        int startPage = 1;
        // 内存中存储的PDF Document
        PDDocument pdDocument = null;
        StringBuffer sb = new StringBuffer();
         try {
            //获取图片
            List<PDPage> pages = pdDocument.getDocumentCatalog().getAllPages();
            Iterator<PDPage> iter = pages.iterator();
            while (iter.hasNext()) {
                PDPage page = (PDPage) iter.next();
                PDResources resources = page.getResources();
                Map<String, PDXObjectImage> images = resources.getImages();
                if (images != null) {
                    Iterator<String> imageIter = images.keySet().iterator();
                    while (imageIter.hasNext()) {
                        String key = (String) imageIter.next();
                        PDXObjectImage image = (PDXObjectImage) images.get(key);
                        if (!ObjectUtils.isEmpty(image.getRGBImage())) {
                            int width = image.getRGBImage().getWidth();
                            int height = image.getRGBImage().getHeight();
                                imageInputStream.close();
                                imageInputStream = null;
                                imageTextVo = null;
                            }
                        }
                    }
                }
            }


            // 获取页码
            int endPage = pdDocument.getNumberOfPages();
            PDFTextStripper stripper = null;
            stripper = new PDFTextStripper();
            // 设置是否排序
            stripper.setSortByPosition(sort);
            // 设置起始页
            stripper.setStartPage(startPage);
            // 设置结束页
            stripper.setEndPage(endPage);
            sb.append(stripper.getText(pdDocument));
            vo.setText(sb.toString());
            vo.setTotalPage(((endPage + 1) - startPage)); //最后一页+1,然后减去开始页
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } finally {
            if (inputStream != null) {
                // 关闭输出流
                inputStream.close();
            }
            if (pdDocument != null) {
                // 关闭PDF Document
                pdDocument.close();
            }
        }
 }

版权声明:本文为qq_23140197原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。