Use of difflib.InsertDelta in the Apache PDFBox project.
The class TestTextStripper, method doTestFile.
/**
 * Validate text extraction on a single file.
 * <p>
 * The text of {@code inFile} is written to a file in {@code outDir} and compared, line by
 * line and skipping blank lines, against the expected-text file that sits next to
 * {@code inFile} ({@code <name>.txt}, or {@code <name>-sorted.txt} when {@code bSort} is set).
 * <ul>
 * <li>If the expected file is missing, {@code bFail} is set, an error is logged and the
 * method returns; the extracted output file is left in place.</li>
 * <li>If every compared line matches, the extracted output file is deleted.</li>
 * <li>Otherwise {@code bFail} is set, each mismatch is logged, the output file is kept and a
 * diff of expected vs. actual is written to {@code <name>[-sorted]-diff.txt} in
 * {@code outDir}.</li>
 * </ul>
 *
 * @param inFile The PDF file to validate
 * @param outDir The directory to store the output in; created if it does not exist
 * @param bLogResult Whether to log the extracted text
 * @param bSort Whether or not the extracted text is sorted
 * @throws Exception if {@code outDir} cannot be created, or if any of the load, extraction
 *         or file operations performed here throws
 */
public void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort) throws Exception {
    if (bSort) {
        log.info("Preparing to parse " + inFile.getName() + " for sorted test");
    } else {
        log.info("Preparing to parse " + inFile.getName() + " for standard test");
    }

    // File.mkdirs() also returns false when the directory already exists, so a directory
    // created by someone else between an exists() check and mkdirs() must not be treated
    // as a failure. Re-check after a failed mkdirs() and only fail if there is still no
    // directory (this also rejects an outDir path that is a regular file).
    if (!outDir.isDirectory() && !outDir.mkdirs() && !outDir.isDirectory()) {
        throw new Exception("Error creating " + outDir.getAbsolutePath() + " directory");
    }

    try (PDDocument document = PDDocument.load(inFile)) {
        // Sorted and unsorted runs use the same naming scheme, differing only by "-sorted".
        String baseName = inFile.getName() + (bSort ? "-sorted" : "");
        File outFile = new File(outDir, baseName + ".txt");
        File diffFile = new File(outDir, baseName + "-diff.txt");
        File expectedFile = new File(inFile.getParentFile(), baseName + ".txt");

        // delete possible leftover from an earlier run (best effort)
        diffFile.delete();

        try (OutputStream os = new FileOutputStream(outFile)) {
            // The output starts with the bytes EF BB BF (the UTF-8 byte order mark).
            os.write(0xEF);
            os.write(0xBB);
            os.write(0xBF);
            try (Writer writer = new BufferedWriter(new OutputStreamWriter(os, ENCODING))) {
                // Allows for sorted tests
                stripper.setSortByPosition(bSort);
                stripper.writeText(document, writer);
            }
            // the written file is closed here, before it is read again below
        }

        if (bLogResult) {
            log.info("Text for " + inFile.getName() + ":");
            log.info(stripper.getText(document));
        }

        if (!expectedFile.exists()) {
            this.bFail = true;
            log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist");
            return;
        }

        boolean localFail = false;
        try (LineNumberReader expectedReader = new LineNumberReader(
                     new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
             LineNumberReader actualReader = new LineNumberReader(
                     new InputStreamReader(new FileInputStream(outFile), ENCODING))) {
            while (true) {
                // Advance each side to its next non-blank line (null means end of file).
                String expectedLine = expectedReader.readLine();
                while (expectedLine != null && expectedLine.trim().isEmpty()) {
                    expectedLine = expectedReader.readLine();
                }
                String actualLine = actualReader.readLine();
                while (actualLine != null && actualLine.trim().isEmpty()) {
                    actualLine = actualReader.readLine();
                }

                if (!stringsEqual(expectedLine, actualLine)) {
                    this.bFail = true;
                    localFail = true;
                    log.error("FAILURE: Line mismatch for file " + inFile.getName() + " (sort = " + bSort + ")" + " at expected line: " + expectedReader.getLineNumber() + " at actual line: " + actualReader.getLineNumber() + "\nexpected line was: \"" + expectedLine + "\"" + "\nactual line was: \"" + actualLine + "\"" + "\n");
                    // lets report all lines, even though this might produce some verbose logging
                }

                // Stop as soon as either file is exhausted.
                if (expectedLine == null || actualLine == null) {
                    break;
                }
            }
        }

        if (!localFail) {
            // Nothing to inspect on success, so do not keep the extracted text around.
            outFile.delete();
        } else {
            // https://code.google.com/p/java-diff-utils/wiki/SampleUsage
            List<String> original = fileToLines(expectedFile);
            List<String> revised = fileToLines(outFile);
            // Compute diff. Get the Patch object. Patch is the container for computed deltas.
            Patch patch = DiffUtils.diff(original, revised);
            try (PrintStream diffPS = new PrintStream(diffFile, ENCODING)) {
                for (Object delta : patch.getDeltas()) {
                    // Each known delta kind is reported as an "Org:"/"New:" pair followed by
                    // a blank line; anything else is printed via its toString().
                    if (delta instanceof ChangeDelta) {
                        ChangeDelta cdelta = (ChangeDelta) delta;
                        diffPS.println("Org: " + cdelta.getOriginal());
                        diffPS.println("New: " + cdelta.getRevised());
                        diffPS.println();
                    } else if (delta instanceof DeleteDelta) {
                        DeleteDelta ddelta = (DeleteDelta) delta;
                        diffPS.println("Org: " + ddelta.getOriginal());
                        diffPS.println("New: " + ddelta.getRevised());
                        diffPS.println();
                    } else if (delta instanceof InsertDelta) {
                        InsertDelta idelta = (InsertDelta) delta;
                        diffPS.println("Org: " + idelta.getOriginal());
                        diffPS.println("New: " + idelta.getRevised());
                        diffPS.println();
                    } else {
                        diffPS.println(delta);
                    }
                }
            }
        }
    }
}
Aggregations