Search in sources :

Example 1 with InsertDelta

Use of difflib.InsertDelta in the Apache pdfbox project.

Class TestTextStripper, method doTestFile.

/**
 * Extracts the text of one PDF and checks it against a stored reference text.
 *
 * <p>The extracted text is written to a file in {@code outDir} and compared, line by line and
 * skipping blank lines, with the reference file that sits next to {@code inFile}. When every
 * line matches, the output file is deleted again. On a mismatch the failure flag of this test
 * is set, the output file is kept and a diff file is written beside it.
 *
 * @param inFile The PDF file to validate
 * @param outDir The directory to store the output in
 * @param bLogResult Whether to log the extracted text
 * @param bSort Whether or not the extracted text is sorted
 * @throws Exception if the output directory cannot be created, or if any of the load, extract
 *         or file operations performed here throws
 */
public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort) throws Exception {
    final String pdfName = inFile.getName();
    log.info("Preparing to parse " + pdfName + (bSort ? " for sorted test" : " for standard test"));

    if (!outDir.exists() && !outDir.mkdirs()) {
        throw (new Exception("Error creating " + outDir.getAbsolutePath() + " directory"));
    }

    try (PDDocument document = PDDocument.load(inFile)) {
        // Sorted and unsorted runs use separate output, diff and reference files.
        final File outFile = new File(outDir, pdfName + (bSort ? "-sorted.txt" : ".txt"));
        final File diffFile = new File(outDir, pdfName + (bSort ? "-sorted-diff.txt" : "-diff.txt"));
        final File expectedFile = new File(inFile.getParentFile(), pdfName + (bSort ? "-sorted.txt" : ".txt"));

        // A diff from an earlier run must not survive into this one.
        diffFile.delete();

        // Both resources are closed here so the file is complete before it is read back below.
        try (OutputStream rawOut = new FileOutputStream(outFile)) {
            // Three-byte marker (EF BB BF) at the start of the output file.
            rawOut.write(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
            try (Writer textOut = new BufferedWriter(new OutputStreamWriter(rawOut, ENCODING))) {
                stripper.setSortByPosition(bSort);
                stripper.writeText(document, textOut);
            }
        }

        if (bLogResult) {
            log.info("Text for " + pdfName + ":");
            log.info(stripper.getText(document));
        }

        if (!expectedFile.exists()) {
            this.bFail = true;
            log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist");
            return;
        }

        boolean mismatchSeen = false;
        try (LineNumberReader referenceReader = new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
            LineNumberReader producedReader = new LineNumberReader(new InputStreamReader(new FileInputStream(outFile), ENCODING))) {
            String expectedLine;
            String actualLine;
            do {
                // Advance each side to its next non-blank line (null once a side is exhausted).
                expectedLine = referenceReader.readLine();
                while (expectedLine != null && expectedLine.trim().isEmpty()) {
                    expectedLine = referenceReader.readLine();
                }
                actualLine = producedReader.readLine();
                while (actualLine != null && actualLine.trim().isEmpty()) {
                    actualLine = producedReader.readLine();
                }

                // Every mismatch is reported; the loop deliberately does not stop at the first one.
                if (!stringsEqual(expectedLine, actualLine)) {
                    mismatchSeen = true;
                    this.bFail = true;
                    log.error("FAILURE: Line mismatch for file " + pdfName + " (sort = " + bSort + ")"
                            + " at expected line: " + referenceReader.getLineNumber()
                            + " at actual line: " + producedReader.getLineNumber()
                            + "\nexpected line was: \"" + expectedLine + "\""
                            + "\nactual line was:   \"" + actualLine + "\"" + "\n");
                }
            } while (expectedLine != null && actualLine != null);
        }

        if (!mismatchSeen) {
            // Nothing to investigate, so the generated text file is not kept.
            outFile.delete();
            return;
        }

        // https://code.google.com/p/java-diff-utils/wiki/SampleUsage
        // The patch holds the deltas between the reference lines and the produced lines.
        List<String> referenceLines = fileToLines(expectedFile);
        List<String> producedLines = fileToLines(outFile);
        Patch patch = DiffUtils.diff(referenceLines, producedLines);
        try (PrintStream diffOut = new PrintStream(diffFile, ENCODING)) {
            for (Object delta : patch.getDeltas()) {
                final Object originalPart;
                final Object revisedPart;
                if (delta instanceof ChangeDelta) {
                    ChangeDelta change = (ChangeDelta) delta;
                    originalPart = change.getOriginal();
                    revisedPart = change.getRevised();
                } else if (delta instanceof DeleteDelta) {
                    DeleteDelta deletion = (DeleteDelta) delta;
                    originalPart = deletion.getOriginal();
                    revisedPart = deletion.getRevised();
                } else if (delta instanceof InsertDelta) {
                    InsertDelta insertion = (InsertDelta) delta;
                    originalPart = insertion.getOriginal();
                    revisedPart = insertion.getRevised();
                } else {
                    // Any other delta type is written using its own string form.
                    diffOut.println(delta);
                    continue;
                }
                diffOut.println("Org: " + originalPart);
                diffOut.println("New: " + revisedPart);
                diffOut.println();
            }
        }
    }
}
Also used : PrintStream(java.io.PrintStream) ChangeDelta(difflib.ChangeDelta) InputStreamReader(java.io.InputStreamReader) DeleteDelta(difflib.DeleteDelta) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) LineNumberReader(java.io.LineNumberReader) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Patch(difflib.Patch) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) InsertDelta(difflib.InsertDelta)

Aggregations

ChangeDelta (difflib.ChangeDelta)1 DeleteDelta (difflib.DeleteDelta)1 InsertDelta (difflib.InsertDelta)1 Patch (difflib.Patch)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 InputStreamReader (java.io.InputStreamReader)1 LineNumberReader (java.io.LineNumberReader)1 OutputStream (java.io.OutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 PrintStream (java.io.PrintStream)1 Writer (java.io.Writer)1 URISyntaxException (java.net.URISyntaxException)1 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)1