diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index 4b62e16ce..3309e160a 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -4,7 +4,8 @@ import com.mindee.geometry.PositionDataField; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; -import com.mindee.pdf.PDFUtils; +import com.mindee.pdf.PDFBoxApi; +import com.mindee.pdf.PDFOperation; import com.mindee.pdf.PdfPageImage; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; @@ -21,29 +22,13 @@ public class ImageExtractor { private final String filename; private final String saveFormat; - /** - * Init from a path. - * - * @param filePath Path to the file. - * @throws IOException Throws if the file can't be accessed. - */ - public ImageExtractor(String filePath) throws IOException { - this(new LocalInputSource(filePath)); - } - - /** - * Init from a {@link LocalInputSource}. - * - * @param source The local source. - * @throws IOException Throws if the file can't be accessed. - */ - public ImageExtractor(LocalInputSource source) throws IOException { + public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException { this.filename = source.getFilename(); this.pageImages = new ArrayList<>(); if (source.isPdf()) { this.saveFormat = "jpg"; - var pdfPageImages = PDFUtils.pdfToImages(source); + var pdfPageImages = pdfOperation.pdfToImages(source); for (PdfPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } @@ -56,6 +41,16 @@ public ImageExtractor(LocalInputSource source) throws IOException { } } + /** + * Init from a {@link LocalInputSource}. + * + * @param source The local source. + * @throws IOException Throws if the file can't be accessed. 
+ */ + public ImageExtractor(LocalInputSource source) throws IOException { + this(source, new PDFBoxApi()); + } + /** * Get the number of pages in the file. * diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index 46340ea72..32beff36b 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -4,8 +4,6 @@ import com.mindee.pdf.PDFBoxApi; import com.mindee.pdf.PDFCompressor; import com.mindee.pdf.PDFOperation; -import com.mindee.pdf.PDFUtils; -import com.mindee.pdf.SplitQuery; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -76,7 +74,7 @@ public int getPageCount() throws IOException { if (!this.isPdf()) { return 1; } - return PDFUtils.getNumberOfPages(this.file); + return getPdfOperation().getNumberOfPages(this.file); } /** @@ -87,7 +85,7 @@ public int getPageCount() throws IOException { */ public void applyPageOptions(PageOptions pageOptions) throws IOException { if (pageOptions != null && this.isPdf()) { - this.file = getPdfOperation().split(new SplitQuery(this.file, pageOptions)).getFile(); + this.file = getPdfOperation().split(this.file, pageOptions).getFile(); } } diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index fb2f62c5c..f9b6b21a8 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -1,17 +1,19 @@ package com.mindee.pdf; -import static com.mindee.pdf.PDFUtils.mergePdfPages; - import com.mindee.MindeeException; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import javax.imageio.ImageIO; import 
org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -112,4 +114,64 @@ public List extractSubDocuments( } return extractedPDFs; } + + private static PDPage clonePage(PDPage page) { + + COSDictionary pageDict = page.getCOSObject(); + COSDictionary newPageDict = new COSDictionary(pageDict); + + newPageDict.removeItem(COSName.ANNOTS); + + return new PDPage(newPageDict); + } + + private static byte[] createPdfFromExistingPdf( + PDDocument document, + List pageNumbers, + boolean closeOriginal + ) throws IOException { + var outputStream = new ByteArrayOutputStream(); + var newDocument = new PDDocument(); + int pageCount = document.getNumberOfPages(); + pageNumbers + .stream() + .filter(i -> i < pageCount) + .forEach(i -> newDocument.addPage(clonePage(document.getPage(i)))); + + newDocument.save(outputStream); + newDocument.close(); + if (closeOriginal) { + document.close(); + } + + byte[] output = outputStream.toByteArray(); + outputStream.close(); + return output; + } + + /** + * Merge specified PDF pages together. + * + * @param file The PDF file. + * @param pageNumbers List of page numbers to merge together. 
+ */ + public static byte[] mergePdfPages(File file, List pageNumbers) throws IOException { + PDDocument document = Loader.loadPDF(file); + return createPdfFromExistingPdf(document, pageNumbers, true); + } + + public static byte[] mergePdfPages( + PDDocument document, + List pageNumbers + ) throws IOException { + return mergePdfPages(document, pageNumbers, true); + } + + public static byte[] mergePdfPages( + PDDocument document, + List pageNumbers, + boolean closeOriginal + ) throws IOException { + return createPdfFromExistingPdf(document, pageNumbers, closeOriginal); + } } diff --git a/src/main/java/com/mindee/pdf/PDFBoxApi.java b/src/main/java/com/mindee/pdf/PDFBoxApi.java index 3d7f28425..6afcb8c69 100644 --- a/src/main/java/com/mindee/pdf/PDFBoxApi.java +++ b/src/main/java/com/mindee/pdf/PDFBoxApi.java @@ -2,6 +2,7 @@ import com.mindee.MindeeException; import com.mindee.input.PageOptions; +import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; @@ -13,6 +14,9 @@ import java.util.stream.IntStream; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; /** * Allows performing various operations on PDFs. 
@@ -20,21 +24,21 @@ public final class PDFBoxApi implements PDFOperation { @Override - public SplitPDF split(SplitQuery splitQuery) throws IOException { + public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { - if (!checkPdfOpen(splitQuery.getFile())) { + if (!checkPdfOpen(fileBytes)) { throw new MindeeException("This document cannot be open and cannot be split."); } - try (var originalDocument = Loader.loadPDF(splitQuery.getFile())) { + try (var originalDocument = Loader.loadPDF(fileBytes)) { try (var splitDocument = new PDDocument()) { - int totalOriginalPages = countPages(splitQuery.getFile()); + int totalOriginalPages = getNumberOfPages(fileBytes); - if (totalOriginalPages < splitQuery.getPageOptions().getOnMinPages()) { - return new SplitPDF(splitQuery.getFile(), totalOriginalPages); + if (totalOriginalPages < pageOptions.getOnMinPages()) { + return new SplitPDF(fileBytes, totalOriginalPages); } - var pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages); + var pageRange = getPageRanges(pageOptions, totalOriginalPages); pageRange .stream() .filter(i -> i < totalOriginalPages) @@ -43,12 +47,65 @@ public SplitPDF split(SplitQuery splitQuery) throws IOException { try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { splitDocument.save(outputStream); byte[] splitPdf = outputStream.toByteArray(); - return new SplitPDF(splitPdf, countPages(splitPdf)); + return new SplitPDF(splitPdf, getNumberOfPages(splitPdf)); } } } } + @Override + public int getNumberOfPages(byte[] fileBytes) throws IOException { + var document = Loader.loadPDF(fileBytes); + int pageCount = document.getNumberOfPages(); + document.close(); + return pageCount; + } + + @Override + public PdfPageImage pdfPageToImage( + byte[] fileBytes, + String filename, + int pageNumber + ) throws IOException { + int index = pageNumber - 1; + PDDocument document = Loader.loadPDF(fileBytes); + var pdfRenderer = new PDFRenderer(document); + 
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); + document.close(); + return new PdfPageImage(imageBuffer, index, filename, "jpg"); + } + + @Override + public List pdfToImages(byte[] fileBytes, String filename) throws IOException { + PDDocument document = Loader.loadPDF(fileBytes); + var pdfRenderer = new PDFRenderer(document); + List pdfPageImages = new ArrayList<>(); + for (int i = 0; i < document.getNumberOfPages(); i++) { + var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); + pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); + } + document.close(); + return pdfPageImages; + } + + private BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + } + private List getPageRanges(PageOptions pageOptions, Integer numberOfPages) { Set pages = Optional @@ -81,8 +138,4 @@ private boolean checkPdfOpen(byte[] documentFile) { } return opens; } - - private int countPages(byte[] documentFile) throws IOException { - return PDFUtils.getNumberOfPages(documentFile); - } } diff --git a/src/main/java/com/mindee/pdf/PDFCompressor.java b/src/main/java/com/mindee/pdf/PDFCompressor.java index b705ea929..d261106de 100644 --- a/src/main/java/com/mindee/pdf/PDFCompressor.java +++ b/src/main/java/com/mindee/pdf/PDFCompressor.java @@ -3,16 +3,25 @@ import static com.mindee.input.InputSourceUtils.hasSourceText; import static com.mindee.input.InputSourceUtils.isPdf; +import java.awt.*; import java.awt.image.BufferedImage; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.util.List; import org.apache.pdfbox.Loader; import 
org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; /** * PDF compression class. @@ -60,7 +69,7 @@ public static byte[] compressPdf( ); } - byte[] docAsBytes = PDFUtils.documentToBytes(outputDoc); + byte[] docAsBytes = documentToBytes(outputDoc); outputDoc.close(); return docAsBytes; } @@ -82,6 +91,12 @@ public static byte[] compressPdf(byte[] pdfData) throws IOException { return compressPdf(pdfData, 85, false, true); } + private static byte[] documentToBytes(PDDocument document) throws IOException { + var outputStream = new ByteArrayOutputStream(); + document.save(outputStream); + return outputStream.toByteArray(); + } + private static void processPage( PDDocument originalDocument, Integer pageIndex, @@ -97,8 +112,100 @@ private static void processPage( var pdImage = JPEGFactory.createFromImage(outputDoc, image, imageQuality); try (var contentStream = new PDPageContentStream(outputDoc, newPage)) { - PDFUtils.addImageToPage(contentStream, pdImage, originalPageSize); - PDFUtils.extractAndAddText(originalDocument, contentStream, pageIndex, disableSourceText); + addImageToPage(contentStream, pdImage, originalPageSize); + extractAndAddText(originalDocument, contentStream, pageIndex, disableSourceText); + } + } + + private static void extractAndAddText( + PDDocument inputDoc, + PDPageContentStream contentStream, + int pageIndex, + boolean disableSourceText + ) throws 
IOException { + if (disableSourceText) { + return; } + + PDFTextStripper stripper = new PDFTextStripper() { + @Override + protected void writeString(String text, List textPositions) throws IOException { + if (textPositions.isEmpty()) { + return; + } + + TextPosition firstPosition = textPositions.get(0); + float fontSize = firstPosition.getFontSizeInPt(); + PDColor color = getGraphicsState().getNonStrokingColor(); + contentStream.beginText(); + contentStream.setFont(firstPosition.getFont(), fontSize); + contentStream.setNonStrokingColor(convertToAwtColor(color)); + + float x = firstPosition.getXDirAdj(); + float y = firstPosition.getPageHeight() - firstPosition.getYDirAdj(); + + contentStream.newLineAtOffset(x, y); + try { + contentStream.showText(text); + } catch (IllegalArgumentException | UnsupportedOperationException e) { + contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize); + contentStream.showText(text); + } + contentStream.endText(); + } + }; + + stripper.setStartPage(pageIndex + 1); + stripper.setEndPage(pageIndex + 1); + stripper.getText(inputDoc); } + + private static Color convertToAwtColor(PDColor pdColor) { + float[] components = pdColor.getComponents(); + if (components.length == 1) { + // Grayscale + return new Color(components[0], components[0], components[0]); + } else if (components.length == 3) { + // RGB + return new Color(components[0], components[1], components[2]); + } else if (components.length == 4) { + // CMYK (simplified conversion) + float c = components[0]; + float m = components[1]; + float y = components[2]; + float k = components[3]; + float r = 1 - Math.min(1, c + k); + float g = 1 - Math.min(1, m + k); + float b = 1 - Math.min(1, y + k); + return new Color(r, g, b); + } + return Color.BLACK; + } + + private static void addImageToPage( + PDPageContentStream contentStream, + PDImageXObject pdImage, + PDRectangle pageSize + ) throws IOException { + contentStream.drawImage(pdImage, 0, 0, 
pageSize.getWidth(), pageSize.getHeight()); + } + + private static BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + } + } diff --git a/src/main/java/com/mindee/pdf/PDFOperation.java b/src/main/java/com/mindee/pdf/PDFOperation.java index 514307021..e9c0af58a 100644 --- a/src/main/java/com/mindee/pdf/PDFOperation.java +++ b/src/main/java/com/mindee/pdf/PDFOperation.java @@ -1,6 +1,9 @@ package com.mindee.pdf; +import com.mindee.input.LocalInputSource; +import com.mindee.input.PageOptions; import java.io.IOException; +import java.util.List; /** * Minimum PDF operations. @@ -9,9 +12,33 @@ public interface PDFOperation { /** * Split a PDF file. - * - * @param splitQuery Options to perform the query. - * @return The split PDF. */ - SplitPDF split(SplitQuery splitQuery) throws IOException; + SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; + + /** + * Get the number of pages in a PDF file. + */ + int getNumberOfPages(byte[] fileBytes) throws IOException; + + default int getNumberOfPages(LocalInputSource inputSource) throws IOException { + return getNumberOfPages(inputSource.getFile()); + } + + /** + * Render a single page of a PDF as an image. + */ + PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; + + default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { + return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); + } + + /** + * Render all pages of a PDF as images. 
+ */ + List pdfToImages(byte[] fileBytes, String filename) throws IOException; + + default List pdfToImages(LocalInputSource source) throws IOException { + return pdfToImages(source.getFile(), source.getFilename()); + } } diff --git a/src/main/java/com/mindee/pdf/PDFUtils.java b/src/main/java/com/mindee/pdf/PDFUtils.java deleted file mode 100644 index 760a0246f..000000000 --- a/src/main/java/com/mindee/pdf/PDFUtils.java +++ /dev/null @@ -1,306 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.LocalInputSource; -import java.awt.Color; -import java.awt.image.BufferedImage; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import org.apache.pdfbox.pdmodel.font.Standard14Fonts; -import org.apache.pdfbox.pdmodel.graphics.color.PDColor; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.pdfbox.text.TextPosition; - -/** - * Utilities for working with PDFs. - */ -public final class PDFUtils { - - private PDFUtils() { - } - - /** - * Get the number of pages in the PDF. - * - * @param inputSource The PDF file. - */ - public static int getNumberOfPages(LocalInputSource inputSource) throws IOException { - return getNumberOfPages(inputSource.getFile()); - } - - /** - * Get the number of pages in the PDF. - * - * @param pdfBytes The PDF file as a byte array. 
- */ - public static int getNumberOfPages(byte[] pdfBytes) throws IOException { - PDDocument document = Loader.loadPDF(pdfBytes); - int pageCount = document.getNumberOfPages(); - document.close(); - return pageCount; - } - - private static PDPage clonePage(PDPage page) { - - COSDictionary pageDict = page.getCOSObject(); - COSDictionary newPageDict = new COSDictionary(pageDict); - - newPageDict.removeItem(COSName.ANNOTS); - - return new PDPage(newPageDict); - } - - private static byte[] createPdfFromExistingPdf( - PDDocument document, - List pageNumbers, - boolean closeOriginal - ) throws IOException { - var outputStream = new ByteArrayOutputStream(); - var newDocument = new PDDocument(); - int pageCount = document.getNumberOfPages(); - pageNumbers - .stream() - .filter(i -> i < pageCount) - .forEach(i -> newDocument.addPage(clonePage(document.getPage(i)))); - - newDocument.save(outputStream); - newDocument.close(); - if (closeOriginal) { - document.close(); - } - - byte[] output = outputStream.toByteArray(); - outputStream.close(); - return output; - } - - /** - * Merge specified PDF pages together. - * - * @param file The PDF file. - * @param pageNumbers Lit of page numbers to merge together. 
- */ - public static byte[] mergePdfPages(File file, List pageNumbers) throws IOException { - PDDocument document = Loader.loadPDF(file); - return createPdfFromExistingPdf(document, pageNumbers, true); - } - - public static byte[] mergePdfPages( - PDDocument document, - List pageNumbers - ) throws IOException { - return mergePdfPages(document, pageNumbers, true); - } - - public static byte[] mergePdfPages( - PDDocument document, - List pageNumbers, - boolean closeOriginal - ) throws IOException { - return createPdfFromExistingPdf(document, pageNumbers, closeOriginal); - } - - public static boolean isPdfEmpty(File file) throws IOException { - return checkIfPdfIsEmpty(Loader.loadPDF(file)); - } - - private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException { - boolean isEmpty = true; - for (PDPage page : document.getPages()) { - PDResources resources = page.getResources(); - if (resources == null) { - continue; - } - Iterable xObjects = resources.getXObjectNames(); - Iterable fonts = resources.getFontNames(); - - if ( - xObjects.spliterator().getExactSizeIfKnown() != 0 - || fonts.spliterator().getExactSizeIfKnown() != 0 - ) { - isEmpty = false; - break; - } - } - document.close(); - - return isEmpty; - } - - /** - * Render all pages of a PDF as images. - * Converting PDFs with hundreds of pages may result in a heap space error. - * - * @param filePath The path to the PDF file. - * @return List of all pages as images. - */ - public static List pdfToImages(String filePath) throws IOException { - return pdfToImages(new LocalInputSource(filePath)); - } - - /** - * Render all pages of a PDF as images. - * Converting PDFs with hundreds of pages may result in a heap space error. - * - * @param source The PDF file. - * @return List of all pages as images. 
- */ - public static List pdfToImages(LocalInputSource source) throws IOException { - PDDocument document = Loader.loadPDF(source.getFile()); - var pdfRenderer = new PDFRenderer(document); - List pdfPageImages = new ArrayList<>(); - for (int i = 0; i < document.getNumberOfPages(); i++) { - BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); - pdfPageImages.add(new PdfPageImage(imageBuffer, i, source.getFilename(), "jpg")); - } - document.close(); - return pdfPageImages; - } - - /** - * Render a single page of a PDF as an image. - * Main use case is for processing PDFs with hundreds of pages. - * If you need to only render some pages from the PDF, use mergePdfPages and then - * pdfToImages. - * - * @param filePath The path to the PDF file. - * @param pageNumber The page number to render, first page is 1. - * @return The page as an image. - */ - public static PdfPageImage pdfPageToImage(String filePath, int pageNumber) throws IOException { - return pdfPageToImage(new LocalInputSource(filePath), pageNumber); - } - - /** - * Render a single page of a PDF as an image. - * Main use case is for processing PDFs with hundreds of pages. - * If you need to only render some pages from the PDF, use mergePdfPages and - * then pdfToImages. - * - * @param source The PDF file. - * @param pageNumber The page number to render, first page is 1. - * @return The page as an image. 
- */ - public static PdfPageImage pdfPageToImage( - LocalInputSource source, - int pageNumber - ) throws IOException { - int index = pageNumber - 1; - PDDocument document = Loader.loadPDF(source.getFile()); - var pdfRenderer = new PDFRenderer(document); - BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); - document.close(); - return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg"); - } - - private static BufferedImage pdfPageToImageBuffer( - int index, - PDDocument document, - PDFRenderer pdfRenderer - ) throws IOException { - PDRectangle bbox = document.getPage(index).getBBox(); - float dimension = bbox.getWidth() * bbox.getHeight(); - int dpi; - if (dimension < 200000) { - dpi = 300; - } else if (dimension < 300000) { - dpi = 250; - } else { - dpi = 200; - } - return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); - } - - public static byte[] documentToBytes(PDDocument document) throws IOException { - var outputStream = new ByteArrayOutputStream(); - document.save(outputStream); - return outputStream.toByteArray(); - } - - public static void extractAndAddText( - PDDocument inputDoc, - PDPageContentStream contentStream, - int pageIndex, - boolean disableSourceText - ) throws IOException { - if (disableSourceText) { - return; - } - - PDFTextStripper stripper = new PDFTextStripper() { - @Override - protected void writeString(String text, List textPositions) throws IOException { - if (textPositions.isEmpty()) { - return; - } - - TextPosition firstPosition = textPositions.get(0); - float fontSize = firstPosition.getFontSizeInPt(); - PDColor color = getGraphicsState().getNonStrokingColor(); - contentStream.beginText(); - contentStream.setFont(firstPosition.getFont(), fontSize); - contentStream.setNonStrokingColor(convertToAwtColor(color)); - - float x = firstPosition.getXDirAdj(); - float y = firstPosition.getPageHeight() - firstPosition.getYDirAdj(); - - contentStream.newLineAtOffset(x, y); - try { - 
contentStream.showText(text); - } catch (IllegalArgumentException | UnsupportedOperationException e) { - contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), fontSize); - contentStream.showText(text); - } - contentStream.endText(); - } - }; - - stripper.setStartPage(pageIndex + 1); - stripper.setEndPage(pageIndex + 1); - stripper.getText(inputDoc); - } - - private static Color convertToAwtColor(PDColor pdColor) { - float[] components = pdColor.getComponents(); - if (components.length == 1) { - // Grayscale - return new Color(components[0], components[0], components[0]); - } else if (components.length == 3) { - // RGB - return new Color(components[0], components[1], components[2]); - } else if (components.length == 4) { - // CMYK (simplified conversion) - float c = components[0]; - float m = components[1]; - float y = components[2]; - float k = components[3]; - float r = 1 - Math.min(1, c + k); - float g = 1 - Math.min(1, m + k); - float b = 1 - Math.min(1, y + k); - return new Color(r, g, b); - } - return Color.BLACK; - } - - public static void addImageToPage( - PDPageContentStream contentStream, - PDImageXObject pdImage, - PDRectangle pageSize - ) throws IOException { - contentStream.drawImage(pdImage, 0, 0, pageSize.getWidth(), pageSize.getHeight()); - } -} diff --git a/src/main/java/com/mindee/pdf/SplitQuery.java b/src/main/java/com/mindee/pdf/SplitQuery.java deleted file mode 100644 index 4da810628..000000000 --- a/src/main/java/com/mindee/pdf/SplitQuery.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.PageOptions; -import lombok.Value; - -/** - * Represent parameter to split a PDF. - */ -@Value -public class SplitQuery { - - /** - * The file. - */ - byte[] file; - /** - * Represent options to cut a document. 
- */ - PageOptions pageOptions; -} diff --git a/src/main/java/com/mindee/v1/MindeeClient.java b/src/main/java/com/mindee/v1/MindeeClient.java index 7dc73d1a9..5f353414c 100644 --- a/src/main/java/com/mindee/v1/MindeeClient.java +++ b/src/main/java/com/mindee/v1/MindeeClient.java @@ -6,7 +6,6 @@ import com.mindee.input.PageOptions; import com.mindee.pdf.PDFBoxApi; import com.mindee.pdf.PDFOperation; -import com.mindee.pdf.SplitQuery; import com.mindee.v1.clientOptions.PollingOptions; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.clientOptions.WorkflowOptions; @@ -132,9 +131,7 @@ protected byte[] getSplitFile( if (pageOptions == null || !localInputSource.isPdf()) { splitFile = localInputSource.getFile(); } else { - splitFile = pdfOperation - .split(new SplitQuery(localInputSource.getFile(), pageOptions)) - .getFile(); + splitFile = pdfOperation.split(localInputSource.getFile(), pageOptions).getFile(); } return splitFile; } diff --git a/src/test/java/com/mindee/image/ImageExtractorTest.java b/src/test/java/com/mindee/image/ImageExtractorTest.java index cff2fa79f..89db4cbaf 100644 --- a/src/test/java/com/mindee/image/ImageExtractorTest.java +++ b/src/test/java/com/mindee/image/ImageExtractorTest.java @@ -2,7 +2,6 @@ import static com.mindee.TestingUtilities.getResourcePath; import static com.mindee.TestingUtilities.getV1ResourcePath; -import static com.mindee.TestingUtilities.getV1ResourcePathString; import com.fasterxml.jackson.databind.JavaType; import com.fasterxml.jackson.databind.ObjectMapper; @@ -88,7 +87,7 @@ public void givenAnImage_shouldExtractValueFields() throws IOException { BarcodeReaderV1 inference = response.getDocument().getInference(); ImageExtractor extractor = new ImageExtractor( - getV1ResourcePathString("products/barcode_reader/default_sample.jpg") + new LocalInputSource(getV1ResourcePath("products/barcode_reader/default_sample.jpg")) ); Assertions.assertEquals(1, extractor.getPageCount()); diff --git 
a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java index af6340c21..9cb08277c 100644 --- a/src/test/java/com/mindee/pdf/PDFOperationTest.java +++ b/src/test/java/com/mindee/pdf/PDFOperationTest.java @@ -3,12 +3,18 @@ import static com.mindee.TestingUtilities.getResourcePath; import com.mindee.MindeeException; +import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; import com.mindee.input.PageOptionsOperation; +import java.io.File; import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; +import java.util.Random; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -16,6 +22,55 @@ public class PDFOperationTest { private final PDFOperation pdfOperation = new PDFBoxApi(); + @Test + public void shouldConvertSinglePageToJpg() throws IOException { + LocalInputSource source = new LocalInputSource( + "src/test/resources/file_types/pdf/multipage.pdf" + ); + PdfPageImage pdfPageImage = pdfOperation.pdfPageToImage(source, 3); + Assertions.assertNotNull(pdfPageImage.getImage()); + Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); + pdfPageImage.writeToFile("src/test/resources/output/"); + Assertions + .assertTrue( + Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) + ); + } + + @Test + public void shouldConvertAllPagesToJpg() throws IOException { + LocalInputSource source = new LocalInputSource( + "src/test/resources/file_types/pdf/multipage.pdf" + ); + List pdfPageImages = pdfOperation.pdfToImages(source); + for (PdfPageImage pdfPageImage : pdfPageImages) { + Assertions.assertNotNull(pdfPageImage.getImage()); + Assertions + .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); + 
pdfPageImage.writeToFile("src/test/resources/output/"); + Assertions + .assertTrue( + Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) + ); + } + } + + @Test + public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws IOException { + PDDocument document = new PDDocument(); + int random = new Random().nextInt(30); + for (int i = 0; i < random; i++) { + PDPage page = new PDPage(); + document.addPage(page); + } + document.save("src/test/resources/output/test.pdf"); + document.close(); + File file = getResourcePath("output/test.pdf").toFile(); + LocalInputSource source = new LocalInputSource(file); + Assertions.assertEquals(random, pdfOperation.getNumberOfPages(source)); + file.delete(); + } + @Test public void givenADocumentAndPageToKeep_whenSplit_thenReturnsOnlyKeptPage() throws IOException { @@ -25,8 +80,7 @@ public void givenADocumentAndPageToKeep_whenSplit_thenReturnsOnlyKeptPage() thro .build(); byte[] fileBytes = Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")); - SplitQuery splitQuery = new SplitQuery(fileBytes, pageOptions); - SplitPDF splitPdf = pdfOperation.split(splitQuery); + SplitPDF splitPdf = pdfOperation.split(fileBytes, pageOptions); Assertions.assertNotNull(splitPdf); Assertions.assertNotNull(splitPdf.getFile()); @@ -45,11 +99,8 @@ public void givenADocumentAndListOfPagesToKeep_whenSplit_thenReturnsOnlyKeptPage .operation(PageOptionsOperation.KEEP_ONLY) .build(); - SplitQuery splitQuery = new SplitQuery( - Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), - pageOptions - ); - SplitPDF splitPdf = pdfOperation.split(splitQuery); + SplitPDF splitPdf = pdfOperation + .split(Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), pageOptions); Assertions.assertNotNull(splitPdf); Assertions.assertNotNull(splitPdf.getFile()); @@ -64,11 +115,8 @@ public void givenADocumentAndListOfPagesToRemove_whenSplit_thenReturnsOnlyNotRem 
.operation(PageOptionsOperation.REMOVE) .build(); - SplitQuery splitQuery = new SplitQuery( - Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), - pageOptions - ); - SplitPDF splitPdf = pdfOperation.split(splitQuery); + SplitPDF splitPdf = pdfOperation + .split(Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), pageOptions); Assertions.assertNotNull(splitPdf); Assertions.assertNotNull(splitPdf.getFile()); @@ -83,12 +131,12 @@ public void givenADocumentOtherThantAPdf_whenSplit_mustFail() throws IOException .operation(PageOptionsOperation.REMOVE) .build(); - SplitQuery splitQuery = new SplitQuery( - Files.readAllBytes(getResourcePath("file_types/receipt.jpg")), - pageOptions - ); - - Assertions.assertThrows(MindeeException.class, () -> pdfOperation.split(splitQuery)); + Assertions + .assertThrows( + MindeeException.class, + () -> pdfOperation + .split(Files.readAllBytes(getResourcePath("file_types/receipt.jpg")), pageOptions) + ); } @Test @@ -100,11 +148,11 @@ public void givenADocumentAndListPagesToRemoveAndMinPagesCondition_whenSplit_mus .onMinPages(5) .build(); - SplitQuery splitQuery = new SplitQuery( - Files.readAllBytes(getResourcePath("file_types/pdf/multipage_cut-2.pdf")), - pageOptions - ); - SplitPDF splitPdf = pdfOperation.split(splitQuery); + SplitPDF splitPdf = pdfOperation + .split( + Files.readAllBytes(getResourcePath("file_types/pdf/multipage_cut-2.pdf")), + pageOptions + ); Assertions.assertNotNull(splitPdf); Assertions.assertNotNull(splitPdf.getFile()); @@ -119,11 +167,8 @@ public void givenADocumentAndNegativeListPagesToKeep_whenSplit_thenReturnsOnlyKe .operation(PageOptionsOperation.KEEP_ONLY) .build(); - SplitQuery splitQuery = new SplitQuery( - Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), - pageOptions - ); - SplitPDF splitPdf = pdfOperation.split(splitQuery); + SplitPDF splitPdf = pdfOperation + .split(Files.readAllBytes(getResourcePath("file_types/pdf/multipage.pdf")), pageOptions); 
Assertions.assertNotNull(splitPdf); Assertions.assertNotNull(splitPdf.getFile()); diff --git a/src/test/java/com/mindee/pdf/PDFUtilsTest.java b/src/test/java/com/mindee/pdf/PDFUtilsTest.java deleted file mode 100644 index cf78023bb..000000000 --- a/src/test/java/com/mindee/pdf/PDFUtilsTest.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.mindee.pdf; - -import static com.mindee.TestingUtilities.getResourcePath; - -import com.mindee.input.LocalInputSource; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -public class PDFUtilsTest { - - @Test - public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws IOException { - PDDocument document = new PDDocument(); - int random = new Random().nextInt(30); - for (int i = 0; i < random; i++) { - PDPage page = new PDPage(); - document.addPage(page); - } - document.save("src/test/resources/output/test.pdf"); - document.close(); - File file = getResourcePath("output/test.pdf").toFile(); - LocalInputSource source = new LocalInputSource(file); - Assertions.assertEquals(random, PDFUtils.getNumberOfPages(source)); - file.delete(); - } - - @Test - public void givenADocumentAndListOfPages_whenMerged_thenReturnsCorrectDocument() throws IOException { - Path original = Paths.get("src/test/resources/file_types/pdf/multipage.pdf"); - Path copied = Paths.get("src/test/resources/output/fileToTest.pdf"); - Files.copy(original, copied, StandardCopyOption.REPLACE_EXISTING); - File file = getResourcePath("output/fileToTest.pdf").toFile(); - List<Integer> pageList = Arrays.asList(0, 2, 3, 1, 10, 2, 1); - byte[] newPdf =
PDFUtils.mergePdfPages(file, pageList); - PDDocument document = Loader.loadPDF(newPdf); - - Assertions.assertEquals(7, document.getNumberOfPages()); - document.close(); - file.delete(); - } - - @Test - public void givenANonEmptyDocument_whenEmptyChecked_shouldReturnFalse() throws IOException { - File pdfFile = getResourcePath("file_types/pdf/multipage.pdf").toFile(); - Assertions.assertFalse(PDFUtils.isPdfEmpty(pdfFile)); - } - - @Test - public void givenAnEmptyDocument_whenEmptyChecked_shouldReturnTrue() throws IOException { - PDDocument document = new PDDocument(); - int random = new Random().nextInt(30); - for (int i = 0; i < random; i++) { - PDPage page = new PDPage(); - document.addPage(page); - } - document.save("src/test/resources/output/test.pdf"); - document.close(); - File file = getResourcePath("output/test.pdf").toFile(); - Assertions.assertTrue(PDFUtils.isPdfEmpty(file)); - file.delete(); - } - - @Test - public void shouldConvertAllPagesToJpg() throws IOException { - List<PdfPageImage> pdfPageImages = PDFUtils - .pdfToImages("src/test/resources/file_types/pdf/multipage_cut-2.pdf"); - for (PdfPageImage pdfPageImage : pdfPageImages) { - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions - .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( - Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } - } - - @Test - public void shouldConvertSinglePageToJpg() throws IOException { - LocalInputSource source = new LocalInputSource( - "src/test/resources/file_types/pdf/multipage.pdf" - ); - PdfPageImage pdfPageImage = PDFUtils.pdfPageToImage(source, 3); - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( -
Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } -}