java/应用程序,解析pdf的几种不同方式

这里写图片描述

图片中列出的是常用的 PDF 解析方法和应用程序。一般来说,扫描版的 PDF 基本上无法解析(页面内容是图片,没有可提取的文本层,需要先做 OCR),至少上面的这几种方法都不行。
上代码:

import com.itextpdf.text.exceptions.UnsupportedPdfException;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Writes the bytes of every stream object found in a PDF to disk, one file
 * per object, using the iText 5 low-level reader API.
 *
 * @author iText
 */
public class ExtractStreams {
    /** Path of the PDF that is read. */
    public static final String SRC = "D:\\Test.pdf";

    /**
     * Output path pattern handed to {@link String#format(String, Object...)}.
     * The {@code %s} placeholder is replaced with the object number so that
     * each stream lands in its own file instead of overwriting the previous one.
     */
    public static final String DEST = "D:\\result\\yuanyuhu%s";

    /**
     * Creates the output directory and extracts all streams of {@link #SRC}.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new ExtractStreams().parse(SRC, DEST);
    }

    /**
     * Iterates over the object numbers {@code 1..getXrefSize()} of the PDF and
     * writes each stream object to the file named by
     * {@code String.format(dest, objectNumber)}.
     *
     * @param src  path of the PDF to read
     * @param dest format pattern for the output files; it should contain a
     *             placeholder such as {@code %s}, otherwise every stream is
     *             written to the same path
     * @throws IOException if reading the PDF or writing an output file fails
     */
    public void parse(String src, String dest) throws IOException {
        PdfReader reader = new PdfReader(src);
        try {
            for (int i = 1; i <= reader.getXrefSize(); i++) {
                PdfObject obj = reader.getPdfObject(i);
                if (obj == null || !obj.isStream()) {
                    continue;
                }
                PRStream stream = (PRStream) obj;
                byte[] b;
                try {
                    b = PdfReader.getStreamBytes(stream);
                } catch (UnsupportedPdfException e) {
                    // iText could not process this stream; fall back to its raw bytes.
                    b = PdfReader.getStreamBytesRaw(stream);
                }
                // try-with-resources closes the file even when write() throws.
                try (FileOutputStream fos = new FileOutputStream(String.format(dest, i))) {
                    fos.write(b);
                    fos.flush();
                }
            }
        } finally {
            // Release the reader even when extraction fails part-way through.
            reader.close();
        }
    }
}
public class ParseCustom {

   public static final String SRC = "D:\\找工作经历反馈.pdf";

   class FontRenderFilter extends RenderFilter {
       public boolean allowText(TextRenderInfo renderInfo) {
//           String font = renderInfo.getFont().getPostscriptFontName();
           return true;
//           return font.endsWith("Bold") || font.endsWith("Oblique");
       }
   }

   public static void main(String[] args) throws IOException, DocumentException {
       new ParseCustom().parse(SRC);
   }

   public void parse(String filename) throws IOException {
       PdfReader reader = new PdfReader(filename);
       FontRenderFilter fontFilter = new FontRenderFilter();
       TextExtractionStrategy strategy = new FilteredTextRenderListener(
               new LocationTextExtractionStrategy(), fontFilter);

       for (int i = 1; i <= reader.getNumberOfPages(); i++) 
           System.out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy));

       reader.close();
   }
}
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * JUnit test that extracts the text of the first page of a PDF with iText 7
 * and writes it, UTF-8 encoded, to a text file.
 */
public class ParseCzech {
   /** Path of the text file that receives the extracted text. */
   public static final String DEST = "D:\\test\\test.txt";
   /** Path of the PDF that is read. */
   public static final String SRC = "D:\\test\\test.pdf";
   // Alternative input used during experiments: "D:\\test\\论文初稿.pdf"

   /**
    * Creates the output directory and runs the extraction once before the
    * test method itself is executed.
    *
    * @throws IOException if reading the PDF or writing the text file fails
    */
   @BeforeClass
   public static void beforeClass() throws IOException {
       File file = new File(DEST);
       file.getParentFile().mkdirs();
       new ParseCzech().manipulatePdf();
   }

   /**
    * Extracts the first page's text, writes it to {@link #DEST} and checks
    * the length of the encoded bytes.
    *
    * @throws IOException if reading the PDF or writing the text file fails
    */
   @Test
   public void manipulatePdf() throws IOException {
       byte[] array;
       // try-with-resources closes the stream first and the document second,
       // even when extraction or writing throws.
       try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
            FileOutputStream fos = new FileOutputStream(DEST)) {
           LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

           PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
           parser.processPageContent(pdfDoc.getFirstPage());
           array = strategy.getResultantText().getBytes("UTF-8");
           fos.write(array);
           fos.flush();
       }

       // NOTE(review): 67 is presumably the UTF-8 byte count for the author's
       // own test.pdf; adjust it for any other input file.
       Assert.assertEquals(67, array.length);
   }
}
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;


/**
 * Reads all text from a PDF with Apache PDFBox, driving {@code PDFParser} directly.
 */
public class PDFReaderTest {
    /**
     * Reads all the text from a PDF file. The caller is responsible for any
     * further formatting of the returned text. 2008-2-25
     *
     * @param pdfFilePath
     *            file path
     * @return all text in the pdf file, or {@code null} if the file could not
     *         be opened, parsed or read (the stack trace is printed)
     */
    public static String getTextFromPDF(String pdfFilePath) {
        String result = null;
        RandomAccessRead randomAccessRead = null;
        PDDocument document = null;
        try {
            randomAccessRead = new RandomAccessBufferedFileInputStream(pdfFilePath);
            PDFParser parser = new PDFParser(randomAccessRead);
            parser.parse();
            document = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(document);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else if (randomAccessRead != null) {
                // Parsing failed before a document existed; close the source
                // here so the file handle is not left open.
                try {
                    randomAccessRead.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }

    /**
     * Prints the text of {@code D:\test.pdf} to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = PDFReaderTest.getTextFromPDF("D:\\test.pdf");
        System.out.println(str);
    }
}
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Extracts the text of a PDF with Apache PDFBox and writes it to a
 * {@code .txt} file in the current working directory.
 */
public class PDFReaderTestOne {

    /**
     * Loads the PDF at {@code filePath} and writes the text of all pages,
     * UTF-8 encoded, to a text file named after the PDF.
     *
     * <p>The output name is the PDF's file name with its last four characters
     * replaced by {@code ".txt"}. For names of four characters or fewer,
     * {@code ".txt"} is appended instead; previously this case left the output
     * name {@code null} and failed with a {@code NullPointerException}.
     *
     * @param filePath path of the PDF to read
     * @throws Exception if the PDF cannot be loaded or the text file cannot be written
     */
    public void readFdf(String filePath) throws Exception {
        // Whether to sort text by position on the page.
        final boolean sort = false;
        final String encoding = "UTF-8";
        final int startPage = 1;
        final int endPage = Integer.MAX_VALUE;

        File file = new File(filePath);
        // try-with-resources closes the writer first and the document second,
        // and still closes the document if closing the writer throws.
        try (PDDocument document = PDDocument.load(file)) {
            String fileName = file.getName();
            System.out.println(fileName);

            String textFile;
            if (fileName.length() > 4) {
                textFile = fileName.substring(0, fileName.length() - 4) + ".txt";
            } else {
                // Too short to carry a four-character extension; just append.
                textFile = fileName + ".txt";
            }
            System.out.println(textFile);
            System.out.println(new File(textFile).getAbsolutePath());

            try (Writer output = new OutputStreamWriter(new FileOutputStream(textFile), encoding)) {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setSortByPosition(sort);
                stripper.setStartPage(startPage);
                stripper.setEndPage(endPage);
                stripper.writeText(document, output);
            }
        }
    }

    /**
     * Runs the extraction on {@code d://Test.pdf} and prints any failure's stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PDFReaderTestOne pdfReader = new PDFReaderTestOne();
        try {
            pdfReader.readFdf("d://Test.pdf");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * @author 作者 : YUHU YUAN
 * @date 创建时间:2016年11月29日 上午11:46:07
 * @version 1.0
 */

public class PDFReaderTestTwo {
    public static void main(String[] args) throws Exception {
        RandomAccessRead randomAccessRead = new RandomAccessBufferedFileInputStream("d://2016MeshTree.pdf");
        PDFParser parser = new PDFParser(randomAccessRead);
        parser.parse();
        PDFTextStripper ts = new PDFTextStripper();
        String s = ts.getText(parser.getPDDocument());
        System.out.println(s);
    }

}
import com.itextpdf.io.font.FontConstants;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.xobject.PdfFormXObject;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.Image;
import com.itextpdf.layout.element.Paragraph;

import java.io.File;


/**
 * Creates a PDF with iText 7 whose first element is a form XObject that is
 * filled in, after all other content has been added, with the document's page count.
 */
public class WriteOnFirstPage  {
    /** Default path of the PDF that is created. */
    public static final String DEST
            = "D:/test11.pdf";

    /**
     * Creates the output directory and writes the PDF to {@link #DEST}.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be created
     */
    public static void main(String[] args) throws Exception {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new WriteOnFirstPage().manipulatePdf(DEST);
    }

    /**
     * Builds the document: a placeholder template, then 100 paragraphs, then
     * the page-count text drawn into the template.
     *
     * @param dest path of the PDF to create
     * @throws Exception if the PDF cannot be created
     */
    protected void manipulatePdf(String dest) throws Exception {
        // Write to the requested path; the original ignored the parameter and
        // always used the DEST constant.
        PdfDocument pdfDoc = new PdfDocument(new PdfWriter(dest));
        Document doc = new Document(pdfDoc);
        PdfFormXObject template = new PdfFormXObject(new Rectangle(523, 50));
        PdfCanvas templateCanvas = new PdfCanvas(template, pdfDoc);

        // The template is added first, while it is still empty.
        doc.add(new Image(template));
        for (int i = 0; i < 100; i++) {
            doc.add(new Paragraph("test"));
        }

        // The page count is read only now, after all paragraphs have been added.
        templateCanvas
                .beginText()
                .setFontAndSize(PdfFontFactory.createFont(FontConstants.HELVETICA), 12)
                .showText(String.format("There are %s pages in this document", pdfDoc.getNumberOfPages()))
                .endText()
                .stroke();

        doc.close();
    }
}

上面的代码可能有些乱,但都经过测试、可以运行,大家可以自行整理(注意每个 public 类需要放到各自同名的 .java 文件中,并补全对应的 import)。


版权声明:本文为yuhushangwei原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。