Use of org.opencastproject.textextractor.api.TextExtractorException in project opencast by opencast.
The class TesseractTextExtractor, method extract.
/**
 * {@inheritDoc}
 * <p>
 * Runs the configured Tesseract binary on the image and parses the text file that is expected at
 * <code>&lt;image base name&gt;.txt</code> in the image's directory. That file is removed again before this
 * method returns, whether or not the run succeeded.
 *
 * @param image
 *          the image to analyze
 * @return the text frame parsed from the Tesseract output file
 * @throws TextExtractorException
 *           if Tesseract exits with a non-zero code, or if running it or reading its output fails
 * @throws IllegalStateException
 *           if no Tesseract binary has been set
 * @see org.opencastproject.textextractor.api.TextExtractor#extract(java.io.File)
 */
@Override
public TextFrame extract(File image) throws TextExtractorException {
  if (binary == null) {
    throw new IllegalStateException("Binary is not set");
  }

  File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));
  // Resolve the output file before Tesseract runs so that the finally block can also remove a
  // (possibly partial) output file when the process fails or exits with an error code.
  File outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");
  InputStream is = null;

  // Run tesseract
  String opts = getAnalysisOptions(image, outputFileBase);
  logger.info("Running Tesseract: {} {}", binary, opts);
  try {
    final int exitCode = ProcessRunner.run(ProcessRunner.mk(binary, opts), fnLogDebug, new Pred<String>() {
      @Override
      public Boolean apply(String line) {
        // Progress and banner lines are expected output; everything else is surfaced as a warning
        final String trimmed = line.trim();
        if (!trimmed.startsWith("Page") && !trimmed.startsWith("Tesseract Open Source OCR Engine")) {
          logger.warn(line);
        }
        return true;
      }
    });
    if (exitCode != 0) {
      throw new TextExtractorException("Text analyzer " + binary + " exited with code " + exitCode);
    }

    // Read the tesseract output file; the stream is closed in the finally block
    is = new FileInputStream(outputFile);
    return TesseractTextFrame.parse(is);
  } catch (IOException e) {
    throw new TextExtractorException("Error running text extractor " + binary, e);
  } finally {
    IOUtils.closeQuietly(is);
    FileUtils.deleteQuietly(outputFile);
  }
}
Use of org.opencastproject.textextractor.api.TextExtractorException in project opencast by opencast.
The class TextAnalyzerServiceImpl, method analyze.
/**
 * Extracts the text lines found on the given image and converts them into video text elements.
 * <p>
 * Each line that carries text is assigned the identifier <code>id-n</code>, where <code>n</code> starts at 1
 * and advances for every such line. Lines for which the dictionary service returns no text are left out of
 * the result, but still consume a number.
 *
 * @param imageFile
 *          the image to analyze
 * @param id
 *          the video text id, used as prefix for the element identifiers
 * @return the video texts found on the image
 * @throws TextAnalyzerException
 *           if the text extractor fails to read the image or to extract text from it
 */
protected VideoText[] analyze(File imageFile, String id) throws TextAnalyzerException {
  // Delegate the actual recognition to the text extractor implementation
  final TextFrame frame;
  try {
    frame = textExtractor.extract(imageFile);
  } catch (IOException e) {
    logger.warn("Error reading image file {}: {}", imageFile, e.getMessage());
    throw new TextAnalyzerException(e);
  } catch (TextExtractorException e) {
    logger.warn("Error extracting text from {}: {}", imageFile, e.getMessage());
    throw new TextAnalyzerException(e);
  }

  // Turn every detected line that carries text into a video text element
  List<VideoText> result = new ArrayList<VideoText>();
  int counter = 1;
  for (TextLine textLine : frame.getLines()) {
    if (textLine.getText() == null) {
      continue;
    }

    VideoText element = new VideoTextImpl(id + "-" + counter);
    counter++;
    element.setBoundary(textLine.getBoundaries());

    // Only keep the element if the dictionary service yields text for this line
    Textual cleaned = dictionaryService.cleanUpText(textLine.getText());
    if (cleaned == null) {
      continue;
    }
    element.setText(cleaned);
    result.add(element);
  }

  return result.toArray(new VideoText[result.size()]);
}
Aggregations