Search in sources :

Example 1 with TextExtractorException

use of org.opencastproject.textextractor.api.TextExtractorException in project opencast by opencast.

The class TesseractTextExtractor, method extract.

/**
 * {@inheritDoc}
 * <p>
 * Runs the configured binary on the image with the options produced by
 * {@code getAnalysisOptions(image, outputFileBase)}, then parses the text file
 * named after the image's base name with a {@code .txt} suffix, located in the
 * image's parent directory. That text file is removed before this method
 * returns, whether the extraction succeeded or failed.
 *
 * @param image
 *          the image to run the text analysis on
 * @return the text frame parsed from the analyzer's output file
 * @throws IllegalStateException
 *           if no binary has been set
 * @throws TextExtractorException
 *           if the binary exits with a non-zero code, or if running it or
 *           reading its output file fails with an {@link IOException}
 * @see org.opencastproject.textextractor.api.TextExtractor#extract(java.io.File)
 */
@Override
public TextFrame extract(File image) throws TextExtractorException {
    if (binary == null) {
        throw new IllegalStateException("Binary is not set");
    }
    File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));
    // Resolve the output file before running the process so the finally block
    // can remove it even when the run fails or exits with an error code.
    File outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");
    InputStream is = null;
    // Run tesseract
    String opts = getAnalysisOptions(image, outputFileBase);
    logger.info("Running Tesseract: {} {}", binary, opts);
    try {
        final int exitCode = ProcessRunner.run(ProcessRunner.mk(binary, opts), fnLogDebug, new Pred<String>() {

            @Override
            public Boolean apply(String line) {
                // Skip the routine page/banner lines; anything else is worth a warning.
                final String trimmed = line.trim();
                if (!trimmed.startsWith("Page") && !trimmed.startsWith("Tesseract Open Source OCR Engine")) {
                    logger.warn(line);
                }
                return true;
            }
        });
        if (exitCode != 0) {
            throw new TextExtractorException("Text analyzer " + binary + " exited with code " + exitCode);
        }
        // Read the tesseract output file
        is = new FileInputStream(outputFile);
        TextFrame textFrame = TesseractTextFrame.parse(is);
        // Close explicitly so a close failure is reported; the finally block is only a safety net.
        is.close();
        return textFrame;
    } catch (IOException e) {
        throw new TextExtractorException("Error running text extractor " + binary, e);
    } finally {
        IOUtils.closeQuietly(is);
        FileUtils.deleteQuietly(outputFile);
    }
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TextExtractorException(org.opencastproject.textextractor.api.TextExtractorException) TextFrame(org.opencastproject.textextractor.api.TextFrame) IOException(java.io.IOException) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 2 with TextExtractorException

use of org.opencastproject.textextractor.api.TextExtractorException in project opencast by opencast.

The class TextAnalyzerServiceImpl, method analyze.

/**
 * Runs the text extractor on an image and converts every detected line into a
 * video text element.
 * <p>
 * Element ids are built as {@code id + "-" + n}, where {@code n} starts at 1
 * and advances for each line that carries text. Lines whose text is rejected
 * by the dictionary service (cleaned result is {@code null}) are left out of
 * the result but still consume a number.
 *
 * @param imageFile
 *          the image
 * @param id
 *          the video text id
 * @return the video text found on the image
 * @throws TextAnalyzerException
 *           if accessing the image or extracting its text fails
 */
protected VideoText[] analyze(File imageFile, String id) throws TextAnalyzerException {
    // Delegate the actual recognition to the text extractor implementation.
    final TextFrame frame;
    try {
        frame = textExtractor.extract(imageFile);
    } catch (IOException e) {
        logger.warn("Error reading image file {}: {}", imageFile, e.getMessage());
        throw new TextAnalyzerException(e);
    } catch (TextExtractorException e) {
        logger.warn("Error extracting text from {}: {}", imageFile, e.getMessage());
        throw new TextAnalyzerException(e);
    }

    // Turn each recognized line into a video text element.
    List<VideoText> result = new ArrayList<VideoText>();
    int counter = 1;
    for (TextLine textLine : frame.getLines()) {
        if (line(textLine) == null) {
            continue;
        }
        VideoText element = new VideoTextImpl(id + "-" + counter);
        counter++;
        element.setBoundary(textLine.getBoundaries());
        Textual cleaned = dictionaryService.cleanUpText(line(textLine));
        if (cleaned == null) {
            // Nothing usable survived the clean-up; drop this line.
            continue;
        }
        element.setText(cleaned);
        result.add(element);
    }
    return result.toArray(new VideoText[0]);
}

/** Returns the raw text of the given line, which may be {@code null}. */
private static String line(TextLine textLine) {
    return textLine.getText();
}
Also used : TextAnalyzerException(org.opencastproject.textanalyzer.api.TextAnalyzerException) TextLine(org.opencastproject.textextractor.api.TextLine) Textual(org.opencastproject.metadata.mpeg7.Textual) TextExtractorException(org.opencastproject.textextractor.api.TextExtractorException) VideoTextImpl(org.opencastproject.metadata.mpeg7.VideoTextImpl) ArrayList(java.util.ArrayList) TextFrame(org.opencastproject.textextractor.api.TextFrame) IOException(java.io.IOException) VideoText(org.opencastproject.metadata.mpeg7.VideoText)

Aggregations

IOException (java.io.IOException)2 TextExtractorException (org.opencastproject.textextractor.api.TextExtractorException)2 TextFrame (org.opencastproject.textextractor.api.TextFrame)2 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 Textual (org.opencastproject.metadata.mpeg7.Textual)1 VideoText (org.opencastproject.metadata.mpeg7.VideoText)1 VideoTextImpl (org.opencastproject.metadata.mpeg7.VideoTextImpl)1 TextAnalyzerException (org.opencastproject.textanalyzer.api.TextAnalyzerException)1 TextLine (org.opencastproject.textextractor.api.TextLine)1