Search in sources:

Example 1 with DocumentAccessException

use of org.olat.search.service.document.file.DocumentAccessException in project OpenOLAT by OpenOLAT.

the class PdfBoxExtractor method extractTextFromPdf.

/**
 * Extract the title and the text content of a PDF file.
 * <p>
 * If the PDF is encrypted, a decryption with the empty password is attempted.
 * When that fails, the content cannot be read and a {@link FileContent} holding
 * only the file name (as title and as content) is returned instead.
 *
 * @param leaf the PDF file to read
 * @return the extracted title and text, or the file name only for PDFs which cannot be decrypted
 * @throws IOException if loading, text extraction or closing of the PDF fails
 * @throws DocumentAccessException declared by the signature; not thrown directly in this
 *         method (NOTE(review): presumably raised by a helper such as getTitle - confirm)
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    BufferedInputStream bis = null;
    try {
        bis = new BufferedInputStream(leaf.getInputStream());
        document = PDDocument.load(bis);
        if (document.isEncrypted()) {
            try {
                // Try the empty password before giving up on the content
                document.decrypt("");
            } catch (Exception e) {
                // Best effort: the content is unreadable, fall back to the file name only
                log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                writer.append(leaf.getName());
                writer.close();
                return new FileContent(leaf.getName(), writer.toString());
            }
        }
        String title = getTitle(document);
        if (log.isDebug()) {
            log.debug("readContent PDDocument loaded");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
        stripper.writeText(document, writer);
        writer.close();
        return new FileContent(title, writer.toString());
    } finally {
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            // Always release the stream, even if closing the document has thrown
            if (bis != null) {
                bis.close();
            }
        }
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) FileContent(org.olat.search.service.document.file.FileContent) BufferedInputStream(java.io.BufferedInputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) PDFTextStripper(org.apache.pdfbox.util.PDFTextStripper)

Example 2 with DocumentAccessException

use of org.olat.search.service.document.file.DocumentAccessException in project OpenOLAT by OpenOLAT.

the class FolderIndexerWorker method doIndexVFSLeaf.

/**
 * Index a single file if its type is supported by the document factory.
 * <p>
 * Sets the file path ({@code fPath + "/" + leaf name}) on the resource context, creates
 * the document and hands it to the indexer. Failures are logged and never propagated
 * to the caller; on interruption the interrupt status of the current thread is restored
 * so that the caller can react to it.
 *
 * @param leafResourceContext the resource context of the file, its file path is set here
 * @param leaf the file to index
 * @param writer the indexer which receives the created document
 * @param fPath the path of the parent folder, used as prefix of the file path
 */
protected void doIndexVFSLeaf(SearchResourceContext leafResourceContext, VFSLeaf leaf, OlatFullIndexer writer, String fPath) {
    if (log.isDebug()) {
        log.debug("Analyse VFSLeaf=" + leaf.getName());
    }
    try {
        if (docFactory.isFileSupported(leaf)) {
            String myFilePath = fPath + "/" + leaf.getName();
            leafResourceContext.setFilePath(myFilePath);
            Document document = docFactory.createDocument(leafResourceContext, leaf);
            if (document != null) {
                // document which are disabled return null
                writer.addDocument(document);
            }
        } else if (log.isDebug()) {
            log.debug("Documenttype not supported. file=" + leaf.getName());
        }
    } catch (DocumentAccessException e) {
        if (log.isDebug()) {
            log.debug("Can not access document." + e.getMessage());
        }
    } catch (InterruptedException e) {
        // The exception is not rethrown: restore the interrupt status so that it is not lost
        Thread.currentThread().interrupt();
        if (log.isDebug()) {
            log.debug("InterruptedException: Can not index leaf=" + leaf.getName() + ";" + e.getMessage());
        }
    } catch (IOException ioEx) {
        log.warn("IOException: Can not index leaf=" + leaf.getName(), ioEx);
    } catch (Exception ex) {
        log.warn("Exception: Can not index leaf=" + leaf.getName(), ex);
    }
}
Also used : IOException(java.io.IOException) Document(org.apache.lucene.document.Document) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException)

Example 3 with DocumentAccessException

use of org.olat.search.service.document.file.DocumentAccessException in project OpenOLAT by OpenOLAT.

the class DialogCourseNodeIndexer method doIndexFile.

/**
 * Index the file of a dialog element.
 * <p>
 * The first leaf of the element's dialog container is indexed if its type is supported
 * by the {@link FileDocumentFactory}. Access, I/O and unexpected errors raised while
 * creating or adding the document are logged and not propagated.
 *
 * @param element the dialog element whose file is indexed
 * @param leafResourceContext the resource context, its file path and document type are set here
 * @param indexWriter the indexer which receives the created document
 * @throws IOException declared by the signature; I/O errors of the indexing step itself are caught and logged
 * @throws InterruptedException if the indexing step is interrupted
 */
private void doIndexFile(DialogElement element, SearchResourceContext leafResourceContext, OlatFullIndexer indexWriter) throws IOException, InterruptedException {
    DialogElementsManager dialogElmsMgr = CoreSpringFactory.getImpl(DialogElementsManager.class);
    VFSContainer dialogContainer = dialogElmsMgr.getDialogContainer(element);
    // NOTE(review): assumes the container holds at least one leaf - get(0) fails otherwise, confirm
    VFSLeaf leaf = (VFSLeaf) dialogContainer.getItems(new VFSLeafFilter()).get(0);
    if (isLogDebugEnabled()) {
        logDebug("Analyse VFSLeaf=" + leaf.getName());
    }
    try {
        FileDocumentFactory docFactory = CoreSpringFactory.getImpl(FileDocumentFactory.class);
        if (docFactory.isFileSupported(leaf)) {
            leafResourceContext.setFilePath(element.getFilename());
            leafResourceContext.setDocumentType(TYPE_FILE);
            Document document = docFactory.createDocument(leafResourceContext, leaf);
            if (document != null) {
                // the factory may return null (e.g. disabled document types): nothing to index then
                indexWriter.addDocument(document);
            }
        } else if (isLogDebugEnabled()) {
            logDebug("Documenttype not supported. file=" + leaf.getName());
        }
    } catch (DocumentAccessException e) {
        if (isLogDebugEnabled()) {
            logDebug("Can not access document." + e.getMessage());
        }
    } catch (IOException ioEx) {
        logWarn("IOException: Can not index leaf=" + leaf.getName(), ioEx);
    } catch (InterruptedException iex) {
        // Rethrow the original exception to keep its stack trace; must not reach the generic catch below
        throw iex;
    } catch (Exception ex) {
        logWarn("Exception: Can not index leaf=" + leaf.getName(), ex);
    }
}
Also used : VFSLeaf(org.olat.core.util.vfs.VFSLeaf) VFSContainer(org.olat.core.util.vfs.VFSContainer) IOException(java.io.IOException) DialogElementsManager(org.olat.course.nodes.dialog.DialogElementsManager) Document(org.apache.lucene.document.Document) ForumMessageDocument(org.olat.search.service.document.ForumMessageDocument) CourseNodeDocument(org.olat.search.service.document.CourseNodeDocument) FileDocumentFactory(org.olat.search.service.document.file.FileDocumentFactory) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) VFSLeafFilter(org.olat.core.util.vfs.filters.VFSLeafFilter) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException)

Example 4 with DocumentAccessException

use of org.olat.search.service.document.file.DocumentAccessException in project openolat by klemens.

the class PdfBoxExtractor method extractTextFromPdf.

/**
 * Extract the title and the text content of a PDF file.
 * <p>
 * If the PDF is encrypted, a decryption with the empty password is attempted.
 * When that fails, the content cannot be read and a {@link FileContent} holding
 * only the file name (as title and as content) is returned instead.
 *
 * @param leaf the PDF file to read
 * @return the extracted title and text, or the file name only for PDFs which cannot be decrypted
 * @throws IOException if loading, text extraction or closing of the PDF fails
 * @throws DocumentAccessException declared by the signature; not thrown directly in this
 *         method (NOTE(review): presumably raised by a helper such as getTitle - confirm)
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    BufferedInputStream bis = null;
    try {
        bis = new BufferedInputStream(leaf.getInputStream());
        document = PDDocument.load(bis);
        if (document.isEncrypted()) {
            try {
                // Try the empty password before giving up on the content
                document.decrypt("");
            } catch (Exception e) {
                // Best effort: the content is unreadable, fall back to the file name only
                log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                writer.append(leaf.getName());
                writer.close();
                return new FileContent(leaf.getName(), writer.toString());
            }
        }
        String title = getTitle(document);
        if (log.isDebug()) {
            log.debug("readContent PDDocument loaded");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
        stripper.writeText(document, writer);
        writer.close();
        return new FileContent(title, writer.toString());
    } finally {
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            // Always release the stream, even if closing the document has thrown
            if (bis != null) {
                bis.close();
            }
        }
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) FileContent(org.olat.search.service.document.file.FileContent) BufferedInputStream(java.io.BufferedInputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) PDFTextStripper(org.apache.pdfbox.util.PDFTextStripper)

Example 5 with DocumentAccessException

use of org.olat.search.service.document.file.DocumentAccessException in project openolat by klemens.

the class DialogCourseNodeIndexer method doIndexFile.

/**
 * Index the file of a dialog element.
 * <p>
 * The first leaf of the element's dialog container is indexed if its type is supported
 * by the {@link FileDocumentFactory}. Access, I/O and unexpected errors raised while
 * creating or adding the document are logged and not propagated.
 *
 * @param element the dialog element whose file is indexed
 * @param leafResourceContext the resource context, its file path and document type are set here
 * @param indexWriter the indexer which receives the created document
 * @throws IOException declared by the signature; I/O errors of the indexing step itself are caught and logged
 * @throws InterruptedException if the indexing step is interrupted
 */
private void doIndexFile(DialogElement element, SearchResourceContext leafResourceContext, OlatFullIndexer indexWriter) throws IOException, InterruptedException {
    DialogElementsManager dialogElmsMgr = CoreSpringFactory.getImpl(DialogElementsManager.class);
    VFSContainer dialogContainer = dialogElmsMgr.getDialogContainer(element);
    // NOTE(review): assumes the container holds at least one leaf - get(0) fails otherwise, confirm
    VFSLeaf leaf = (VFSLeaf) dialogContainer.getItems(new VFSLeafFilter()).get(0);
    if (isLogDebugEnabled()) {
        logDebug("Analyse VFSLeaf=" + leaf.getName());
    }
    try {
        FileDocumentFactory docFactory = CoreSpringFactory.getImpl(FileDocumentFactory.class);
        if (docFactory.isFileSupported(leaf)) {
            leafResourceContext.setFilePath(element.getFilename());
            leafResourceContext.setDocumentType(TYPE_FILE);
            Document document = docFactory.createDocument(leafResourceContext, leaf);
            if (document != null) {
                // the factory may return null (e.g. disabled document types): nothing to index then
                indexWriter.addDocument(document);
            }
        } else if (isLogDebugEnabled()) {
            logDebug("Documenttype not supported. file=" + leaf.getName());
        }
    } catch (DocumentAccessException e) {
        if (isLogDebugEnabled()) {
            logDebug("Can not access document." + e.getMessage());
        }
    } catch (IOException ioEx) {
        logWarn("IOException: Can not index leaf=" + leaf.getName(), ioEx);
    } catch (InterruptedException iex) {
        // Rethrow the original exception to keep its stack trace; must not reach the generic catch below
        throw iex;
    } catch (Exception ex) {
        logWarn("Exception: Can not index leaf=" + leaf.getName(), ex);
    }
}
Also used : VFSLeaf(org.olat.core.util.vfs.VFSLeaf) VFSContainer(org.olat.core.util.vfs.VFSContainer) IOException(java.io.IOException) DialogElementsManager(org.olat.course.nodes.dialog.DialogElementsManager) Document(org.apache.lucene.document.Document) ForumMessageDocument(org.olat.search.service.document.ForumMessageDocument) CourseNodeDocument(org.olat.search.service.document.CourseNodeDocument) FileDocumentFactory(org.olat.search.service.document.file.FileDocumentFactory) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) VFSLeafFilter(org.olat.core.util.vfs.filters.VFSLeafFilter) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException)

Aggregations

IOException (java.io.IOException)10 DocumentAccessException (org.olat.search.service.document.file.DocumentAccessException)10 Document (org.apache.lucene.document.Document)8 FileDocumentFactory (org.olat.search.service.document.file.FileDocumentFactory)6 VFSContainer (org.olat.core.util.vfs.VFSContainer)4 VFSLeaf (org.olat.core.util.vfs.VFSLeaf)4 BufferedInputStream (java.io.BufferedInputStream)2 IndexableField (org.apache.lucene.index.IndexableField)2 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)2 PDFTextStripper (org.apache.pdfbox.util.PDFTextStripper)2 LimitedContentWriter (org.olat.core.util.io.LimitedContentWriter)2 VFSItem (org.olat.core.util.vfs.VFSItem)2 VFSLeafFilter (org.olat.core.util.vfs.filters.VFSLeafFilter)2 DialogElementsManager (org.olat.course.nodes.dialog.DialogElementsManager)2 AbstractOlatDocument (org.olat.search.model.AbstractOlatDocument)2 SearchResourceContext (org.olat.search.service.SearchResourceContext)2 CourseNodeDocument (org.olat.search.service.document.CourseNodeDocument)2 ForumMessageDocument (org.olat.search.service.document.ForumMessageDocument)2 FileContent (org.olat.search.service.document.file.FileContent)2