图片中列出的是常用的 PDF 解析方法和应用程序。一般来说,扫描版的 PDF 基本上无法解析,至少上面这几种方法都不行。
上代码:
import com.itextpdf.text.exceptions.UnsupportedPdfException;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
* @author iText
*/
public class ExtractStreams {
/** Path of the PDF whose stream objects are extracted. */
public static final String SRC = "D:\\Test.pdf";
/**
 * Output path pattern. It is passed to {@code String.format} together with the
 * object number, so the {@code %s} placeholder gives every extracted stream its
 * own file instead of all streams overwriting a single one.
 */
public static final String DEST = "D:\\result\\yuanyuhu%s";
/**
 * Creates the parent directory of {@code DEST} and runs the stream extraction
 * on {@code SRC}.
 *
 * @param args command-line arguments (not used)
 * @throws IOException propagated from {@code parse}
 */
public static void main(String[] args) throws IOException {
    File outputDir = new File(DEST).getParentFile();
    outputDir.mkdirs();
    ExtractStreams extractor = new ExtractStreams();
    extractor.parse(SRC, DEST);
}
/**
 * Writes the bytes of every stream object found in a PDF to disk.
 *
 * <p>{@code dest} is used as a format string: the file for object number
 * {@code i} is {@code String.format(dest, i)}. If {@code dest} contains no
 * placeholder, every stream resolves to the same path and later streams
 * overwrite earlier ones.
 *
 * @param src  path of the PDF to read
 * @param dest output path pattern, formatted with the object number
 * @throws IOException if the PDF cannot be read or an output file cannot be written
 */
public void parse(String src, String dest) throws IOException {
    PdfReader reader = new PdfReader(src);
    try {
        for (int i = 1; i <= reader.getXrefSize(); i++) {
            PdfObject obj = reader.getPdfObject(i);
            if (obj == null || !obj.isStream()) {
                continue;
            }
            PRStream stream = (PRStream) obj;
            byte[] bytes;
            try {
                bytes = PdfReader.getStreamBytes(stream);
            } catch (UnsupportedPdfException e) {
                // Decoding was rejected as unsupported; fall back to the raw stream bytes.
                bytes = PdfReader.getStreamBytesRaw(stream);
            }
            // try-with-resources closes the file even when write() fails.
            try (FileOutputStream fos = new FileOutputStream(String.format(dest, i))) {
                fos.write(bytes);
            }
        }
    } finally {
        // Release the reader's underlying file even if extraction fails part-way.
        reader.close();
    }
}
}public class ParseCustom {
/** Path of the PDF that {@code main} passes to {@code parse}. */
public static final String SRC = "D:\\找工作经历反馈.pdf";
class FontRenderFilter extends RenderFilter {
public boolean allowText(TextRenderInfo renderInfo) {
// String font = renderInfo.getFont().getPostscriptFontName();
return true;
// return font.endsWith("Bold") || font.endsWith("Oblique");
}
}
/**
 * Extracts and prints the text of the PDF at {@code SRC}.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) throws IOException, DocumentException {
    ParseCustom extractor = new ParseCustom();
    extractor.parse(SRC);
}
/**
 * Prints the text of every page of a PDF to standard output, one page per
 * {@code println} call.
 *
 * @param filename path of the PDF to read
 * @throws IOException if the PDF cannot be read
 */
public void parse(String filename) throws IOException {
    PdfReader reader = new PdfReader(filename);
    try {
        FontRenderFilter fontFilter = new FontRenderFilter();
        int pageCount = reader.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++) {
            // A fresh strategy per page: LocationTextExtractionStrategy keeps the
            // chunks it has received, so reusing one instance would make each
            // page's output repeat the text of all earlier pages.
            TextExtractionStrategy strategy = new FilteredTextRenderListener(
                    new LocationTextExtractionStrategy(), fontFilter);
            System.out.println(PdfTextExtractor.getTextFromPage(reader, page, strategy));
        }
    } finally {
        // Close the reader even when extraction of some page fails.
        reader.close();
    }
}
}import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Extracts the text of the first page of {@link #SRC} with iText 7 and writes it,
 * UTF-8 encoded, to {@link #DEST}.
 */
public class ParseCzech {
    /** File that receives the extracted text. */
    public static final String DEST = "D:\\test\\test.txt";
    /** PDF whose first page is parsed. */
    public static final String SRC = "D:\\test\\test.pdf";
    // Alternative input used while testing: "D:\\test\\论文初稿.pdf"

    /**
     * Creates the output directory and runs the extraction once before the tests.
     *
     * @throws IOException propagated from {@link #manipulatePdf()}
     */
    @BeforeClass
    public static void beforeClass() throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new ParseCzech().manipulatePdf();
    }

    /**
     * Parses the first page, writes the resulting text to {@link #DEST} and checks
     * the number of bytes written.
     *
     * @throws IOException if the PDF cannot be read or the text file cannot be written
     */
    @Test
    public void manipulatePdf() throws IOException {
        byte[] array;
        // try-with-resources closes the document even if parsing or writing fails.
        try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC))) {
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            parser.processPageContent(pdfDoc.getFirstPage());
            array = strategy.getResultantText().getBytes("UTF-8");
            // The output file is opened only after extraction succeeded, so a
            // failed parse no longer leaves a truncated file behind.
            try (FileOutputStream fos = new FileOutputStream(DEST)) {
                fos.write(array);
                fos.flush();
            }
        }
        // NOTE(review): 67 looks like the expected byte count for one specific
        // sample PDF - confirm it matches the file configured in SRC.
        Assert.assertEquals(67, array.length);
    }
}
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PDFReaderTest {
/**
 * Reads all text from a PDF file. The text is returned as produced by
 * {@code PDFTextStripper}; any further formatting is up to the caller.
 *
 * <p>I/O failures are not propagated: the stack trace is printed and
 * {@code null} is returned.
 *
 * @param pdfFilePath
 *            path of the PDF file
 * @return all text in the PDF file, or {@code null} if it could not be read
 */
public static String getTextFromPDF(String pdfFilePath) {
    String result = null;
    RandomAccessRead source = null;
    PDDocument document = null;
    try {
        source = new RandomAccessBufferedFileInputStream(pdfFilePath);
        PDFParser parser = new PDFParser(source);
        parser.parse();
        document = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        result = stripper.getText(document);
    } catch (IOException e) {
        // Covers FileNotFoundException as well; best effort, the caller gets null.
        e.printStackTrace();
    } finally {
        try {
            if (document != null) {
                document.close();
            } else if (source != null) {
                // Parsing failed before a document existed, so nothing else will
                // close the file handle.
                // NOTE(review): relies on PDDocument.close() releasing the source
                // once a document was created - confirm for the PDFBox version in use.
                source.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return result;
}
/**
 * Prints the text extracted from a fixed sample PDF.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    String extracted = getTextFromPDF("D:\\test.pdf");
    System.out.println(extracted);
}
}import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PDFReaderTestOne {
/**
 * Extracts the text of a PDF into a UTF-8 text file.
 *
 * <p>The text file is created by name only (relative to the working directory):
 * the PDF's file name with its extension replaced by {@code .txt}, or with
 * {@code .txt} appended when the name has no extension. The PDF name, the text
 * file name and its absolute path are printed to standard output.
 *
 * @param filePath path of the PDF to read
 * @throws Exception if the PDF cannot be loaded or the text file cannot be written
 */
public void readFdf(String filePath) throws Exception {
    // Whether the stripper should sort text by position.
    final boolean sort = false;
    final String encoding = "UTF-8";
    final int startPage = 1;
    final int endPage = Integer.MAX_VALUE;

    File file = new File(filePath);
    try (PDDocument document = PDDocument.load(file)) {
        String fileName = file.getName();
        System.out.println(fileName);

        // Derive the output name from the extension instead of blindly cutting
        // four characters: short names no longer leave the target undefined and
        // names without a three-letter extension are not mangled.
        int dot = fileName.lastIndexOf('.');
        String baseName = dot > 0 ? fileName.substring(0, dot) : fileName;
        File outputFile = new File(baseName + ".txt");
        String textFile = outputFile.getName();
        System.out.println(textFile);
        System.out.println(outputFile.getAbsolutePath());

        try (FileOutputStream fileOut = new FileOutputStream(textFile);
                Writer output = new OutputStreamWriter(fileOut, encoding)) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            stripper.writeText(document, output);
        }
    }
}
/**
 * Runs the extraction on a fixed sample PDF and prints the stack trace of any
 * failure.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    try {
        new PDFReaderTestOne().readFdf("d://Test.pdf");
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
}import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.text.PDFTextStripper;
/**
* @author YUHU YUAN
* @date Created: 2016-11-29 11:46:07
* @version 1.0
*/
public class PDFReaderTestTwo {
/**
 * Parses a fixed sample PDF with PDFBox and prints its text.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the file cannot be opened, parsed or closed
 */
public static void main(String[] args) throws Exception {
    RandomAccessRead randomAccessRead = new RandomAccessBufferedFileInputStream("d://2016MeshTree.pdf");
    // Fully qualified because this snippet does not import PDDocument.
    org.apache.pdfbox.pdmodel.PDDocument document = null;
    try {
        PDFParser parser = new PDFParser(randomAccessRead);
        parser.parse();
        document = parser.getPDDocument();
        PDFTextStripper ts = new PDFTextStripper();
        String s = ts.getText(document);
        System.out.println(s);
    } finally {
        if (document != null) {
            document.close();
        } else {
            // No document was created, so the file handle has to be closed directly.
            randomAccessRead.close();
        }
    }
}
}import com.itextpdf.io.font.FontConstants;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.xobject.PdfFormXObject;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.Image;
import com.itextpdf.layout.element.Paragraph;
import java.io.File;
public class WriteOnFirstPage {
/** Path of the PDF that {@code main} asks {@code manipulatePdf} to create. */
public static final String DEST
= "D:/test11.pdf";
/**
 * Creates the parent directory of {@code DEST} and generates the sample PDF there.
 *
 * @param args command-line arguments (not used)
 * @throws Exception propagated from {@code manipulatePdf}
 */
public static void main(String[] args) throws Exception {
    File parentDir = new File(DEST).getParentFile();
    parentDir.mkdirs();
    WriteOnFirstPage sample = new WriteOnFirstPage();
    sample.manipulatePdf(DEST);
}
/**
 * Creates a PDF with a placeholder template at the top followed by 100 "test"
 * paragraphs, then writes the page count into the template.
 *
 * @param dest path of the PDF file to create
 * @throws Exception if the document cannot be created or written
 */
protected void manipulatePdf(String dest) throws Exception {
    // Write to the path requested by the caller rather than the class constant.
    PdfDocument pdfDoc = new PdfDocument(new PdfWriter(dest));
    Document doc = new Document(pdfDoc);
    PdfFormXObject template = new PdfFormXObject(new Rectangle(523, 50));
    PdfCanvas templateCanvas = new PdfCanvas(template, pdfDoc);
    doc.add(new Image(template));
    for (int i = 0; i < 100; i++) {
        doc.add(new Paragraph("test"));
    }
    // The template is filled only after all paragraphs were added; presumably so
    // that getNumberOfPages() reflects the pages laid out so far.
    templateCanvas
            .beginText()
            .setFontAndSize(PdfFontFactory.createFont(FontConstants.HELVETICA), 12)
            .showText(String.format("There are %s pages in this document", pdfDoc.getNumberOfPages()))
            .endText()
            .stroke();
    doc.close();
}
}
上面的代码可能有些乱,但是都经过测试能用,可以自己整理一下。
版权声明:本文为yuhushangwei原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。