Extracting text from a PDF is not as simple as reading text from a plain-text (.txt) file; a suitable library is needed to extract data from a PDF file. For example, the code snippets below show one way to extract text or images from a PDF file using the Apache PDFBox library.
NOTE: The code snippets below use Apache PDFBox version 2.0.4. (Click here to download)
/**
 * Extracts the complete text of a PDF file and prints it to standard output.
 *
 * @param pdfFilePath path of the PDF file to read
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public void getCompleteTextFromPdf(String pdfFilePath) throws IOException {
    String pdfText;
    // try-with-resources closes the document even if text extraction throws,
    // which the previous explicit close() call did not guarantee.
    try (PDDocument pddoc = PDDocument.load(new File(pdfFilePath))) {
        PDFTextStripper pdfstripper = new PDFTextStripper();
        pdfText = pdfstripper.getText(pddoc);
    }
    System.out.println("Complete Text: \n" + pdfText);
}
/** Extract Text from particular region in PDF **/
/**
 * Extracts the text found inside a fixed rectangular region of the first page
 * of a PDF file and prints it to standard output.
 *
 * <p>The region is hard-coded as {@code new Rectangle(10, 50, 300, 100)} and is
 * registered under the name {@code "TestRegion"}.
 *
 * @param pdfFilePath path of the PDF file to read
 * @throws IOException if the file cannot be loaded or the region cannot be extracted
 */
public void getTextFromPdfRegion(String pdfFilePath) throws IOException {
    String pdfText;
    // try-with-resources closes the document even if region extraction throws,
    // which the previous explicit close() call did not guarantee.
    try (PDDocument pddoc = PDDocument.load(new File(pdfFilePath))) {
        PDPage page1 = pddoc.getPage(0);
        PDFTextStripperByArea pdfareaStripper = new PDFTextStripperByArea();
        pdfareaStripper.setSortByPosition(true);
        // NOTE(review): presumably java.awt.Rectangle (x, y, width, height) — confirm against imports.
        Rectangle rectangle = new Rectangle(10, 50, 300, 100);
        pdfareaStripper.addRegion("TestRegion", rectangle);
        pdfareaStripper.extractRegions(page1);
        pdfText = pdfareaStripper.getTextForRegion("TestRegion");
    }
    System.out.println("Region Text: \n" + pdfText);
}
/**
 * Extracts every image XObject referenced directly by the pages of a PDF file
 * and saves each one as a PNG under {@code <user.dir>/images}.
 *
 * <p>Files are named {@code Image<currentTimeMillis>_<n>.png}, where {@code n}
 * is a running counter, so that images written within the same millisecond do
 * not overwrite each other. The output directory is created if it is missing.
 * Pages without a resource dictionary are skipped.
 *
 * @param pdfFilePath path of the PDF file to read
 * @throws InvalidPasswordException if the PDF is encrypted and cannot be opened
 * @throws IOException if the file cannot be loaded, the output directory cannot
 *         be created, or an image cannot be read or written
 */
public static void getImagesFromPdf(String pdfFilePath) throws InvalidPasswordException, IOException {
    // try-with-resources: the document was previously never closed.
    try (PDDocument pddoc = PDDocument.load(new File(pdfFilePath))) {
        File imageDir = new File(System.getProperty("user.dir") + "/images");
        // ImageIO cannot write into a directory that does not exist, so create it up front.
        if (!imageDir.isDirectory() && !imageDir.mkdirs()) {
            throw new IOException("Could not create image output directory: " + imageDir);
        }

        int imageCount = 0;
        for (PDPage page : pddoc.getPages()) {
            PDResources rs = page.getResources();
            if (rs == null) {
                // A page may have no resource dictionary; nothing to extract from it.
                continue;
            }
            for (COSName cosObj : rs.getXObjectNames()) {
                PDXObject obj = rs.getXObject(cosObj);
                if (obj instanceof PDImageXObject) {
                    imageCount++;
                    // The counter suffix keeps names unique when several images are
                    // written within the same millisecond.
                    File target = new File(imageDir,
                            "Image" + System.currentTimeMillis() + "_" + imageCount + ".png");
                    ImageIO.write(((PDImageXObject) obj).getImage(), "png", target);
                }
            }
        }
    }
}
SAMPLE PDF FILES:
No comments:
Post a Comment