Search in sources :

Example 6 with LimitedContentWriter

use of org.olat.core.util.io.LimitedContentWriter in project OpenOLAT by OpenOLAT.

the class WordOOXMLDocument method readContent.

/**
 * Reads the textual content of a Word OOXML file.
 * <p>
 * The file behind {@code leaf} is opened as a zip archive. Entries whose name
 * ends with {@code word/document.xml}, or whose name starts with
 * {@code HEADER} or {@code FOOTER} and ends with {@code .xml}, are collected,
 * ordered with {@link WordDocumentComparator} and parsed one after the other
 * into a single {@link LimitedContentWriter}. An entry is only parsed while
 * {@code writer.accept()} returns {@code true}.
 *
 * @param leaf the file to read; it is cast to {@link JavaIOItem} to obtain
 *             the underlying {@link File}
 * @return a {@link FileContent} built from everything written to the writer
 * @throws IOException declared by the signature; failures raised inside the
 *             try block are reported as {@link DocumentException} instead
 * @throws DocumentException rethrown unchanged when raised while parsing, or
 *             wrapping any other exception (the original exception is kept
 *             as the cause)
 */
@Override
public FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
    File file = ((JavaIOItem) leaf).getBasefile();
    LimitedContentWriter writer = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize());
    try (ZipFile wordFile = new ZipFile(file)) {
        // First pass: collect the names of the parts that carry text.
        List<String> contents = new ArrayList<>();
        for (Enumeration<? extends ZipEntry> entriesEnumeration = wordFile.entries(); entriesEnumeration.hasMoreElements(); ) {
            String name = entriesEnumeration.nextElement().getName();
            boolean headerOrFooter = name.endsWith(".xml") && (name.startsWith(HEADER) || name.startsWith(FOOTER));
            if (name.endsWith("word/document.xml") || headerOrFooter) {
                contents.add(name);
            }
        }
        if (contents.size() > 1) {
            Collections.sort(contents, new WordDocumentComparator());
        }
        // Second pass: parse each collected part into the shared writer.
        for (String content : contents) {
            if (writer.accept()) {
                ZipEntry entry = wordFile.getEntry(content);
                // try-with-resources closes the entry stream even when parse() fails
                try (InputStream zip = wordFile.getInputStream(entry)) {
                    OfficeDocumentHandler dh = new OfficeDocumentHandler(writer);
                    parse(new ShieldInputStream(zip), dh);
                }
            }
        }
    } catch (DocumentException e) {
        throw e;
    } catch (Exception e) {
        // keep the original exception as cause so the stack trace is not lost
        throw new DocumentException(e.getMessage(), e);
    }
    return new FileContent(writer.toString());
}
Also used : JavaIOItem(org.olat.core.util.vfs.JavaIOItem) ShieldInputStream(org.olat.core.util.io.ShieldInputStream) InputStream(java.io.InputStream) ZipEntry(java.util.zip.ZipEntry) ArrayList(java.util.ArrayList) IOException(java.io.IOException) LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) ZipFile(java.util.zip.ZipFile) File(java.io.File)

Example 7 with LimitedContentWriter

use of org.olat.core.util.io.LimitedContentWriter in project OpenOLAT by OpenOLAT.

the class PdfBoxExtractor method extractTextFromPdf.

/**
 * Extracts the title and the text of a PDF file with PDFBox.
 * <p>
 * If the document is encrypted, a decryption with the empty password is
 * attempted. When that attempt fails, a warning is logged and the file name
 * is returned both as title and as content.
 *
 * @param leaf the PDF file to read
 * @return the title obtained from {@code getTitle(document)} together with
 *         the text written by {@link PDFTextStripper}; or the file name as
 *         title and content when the document could not be decrypted
 * @throws IOException propagated from reading, parsing or closing the document
 * @throws DocumentAccessException declared by the signature
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    // try-with-resources closes the stream even if closing the document throws
    try (BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream())) {
        PDDocument document = PDDocument.load(bis);
        try {
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (Exception e) {
                    // Best effort: an unreadable PDF is still indexed under its file name.
                    log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                    LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                    writer.append(leaf.getName());
                    writer.close();
                    return new FileContent(leaf.getName(), writer.toString());
                }
            }
            String title = getTitle(document);
            if (log.isDebug()) {
                log.debug("readContent PDDocument loaded");
            }
            PDFTextStripper stripper = new PDFTextStripper();
            LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
            stripper.writeText(document, writer);
            writer.close();
            return new FileContent(title, writer.toString());
        } finally {
            // the document is closed before the stream, as in the original ordering
            document.close();
        }
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) FileContent(org.olat.search.service.document.file.FileContent) BufferedInputStream(java.io.BufferedInputStream) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) IOException(java.io.IOException) DocumentAccessException(org.olat.search.service.document.file.DocumentAccessException) PDFTextStripper(org.apache.pdfbox.util.PDFTextStripper)

Example 8 with LimitedContentWriter

use of org.olat.core.util.io.LimitedContentWriter in project OpenOLAT by OpenOLAT.

the class ExcelDocument method readContent.

/**
 * Reads the text of an XLS workbook by concatenating the value of every
 * string cell, each followed by a single space.
 * <p>
 * Sheets, rows and cells that are {@code null} are counted and the totals are
 * written to the debug log.
 *
 * @param leaf the XLS file to read
 * @return a {@link FileContent} holding the concatenated string cell values
 * @throws IOException declared by the signature; failures raised inside the
 *             try block are reported as {@link DocumentException} instead
 * @throws DocumentException wrapping any exception raised while opening or
 *             reading the workbook
 */
@Override
protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
    int cellNullCounter = 0;
    int rowNullCounter = 0;
    int sheetNullCounter = 0;
    try (BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
        HSSFWorkbook workbook = new HSSFWorkbook(new POIFSFileSystem(bis))) {
        // Clamp before narrowing so an out-of-range size cannot wrap to a negative int.
        int initialSize = (int) Math.max(0L, Math.min(leaf.getSize(), Integer.MAX_VALUE));
        LimitedContentWriter content = new LimitedContentWriter(initialSize, FileDocumentFactory.getMaxFileSize());
        for (int sheetNumber = 0; sheetNumber < workbook.getNumberOfSheets(); sheetNumber++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetNumber);
            if (sheet == null) {
                sheetNullCounter++;
                continue;
            }
            for (int rowNumber = sheet.getFirstRowNum(); rowNumber <= sheet.getLastRowNum(); rowNumber++) {
                HSSFRow row = sheet.getRow(rowNumber);
                if (row == null) {
                    rowNullCounter++;
                    continue;
                }
                // POI: getFirstCellNum() is -1 for a row without cells and
                // getLastCellNum() is the last cell index PLUS ONE (also -1 when
                // empty), so the upper bound is exclusive.
                int firstCell = Math.max(0, row.getFirstCellNum());
                int lastCellExclusive = row.getLastCellNum();
                for (int cellNumber = firstCell; cellNumber < lastCellExclusive; cellNumber++) {
                    HSSFCell cell = row.getCell(cellNumber);
                    if (cell == null) {
                        cellNullCounter++;
                    } else if (cell.getCellTypeEnum() == CellType.STRING) {
                        content.append(cell.getStringCellValue()).append(' ');
                    }
                }
            }
        }
        if (log.isDebug() && (cellNullCounter > 0 || rowNullCounter > 0 || sheetNullCounter > 0)) {
            log.debug("Read Excel content cell=null #:" + cellNullCounter + ", row=null #:" + rowNullCounter + ", sheet=null #:" + sheetNullCounter);
        }
        content.close();
        return new FileContent(content.toString());
    } catch (Exception ex) {
        throw new DocumentException("Can not read XLS Content. File=" + leaf.getName(), ex);
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) BufferedInputStream(java.io.BufferedInputStream) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) HSSFCell(org.apache.poi.hssf.usermodel.HSSFCell) HSSFRow(org.apache.poi.hssf.usermodel.HSSFRow) HSSFSheet(org.apache.poi.hssf.usermodel.HSSFSheet) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) IOException(java.io.IOException)

Example 9 with LimitedContentWriter

use of org.olat.core.util.io.LimitedContentWriter in project openolat by klemens.

the class ExcelDocument method readContent.

/**
 * Reads the text of an XLS workbook by concatenating the value of every
 * string cell, each followed by a single space.
 * <p>
 * Sheets, rows and cells that are {@code null} are counted and the totals are
 * written to the debug log.
 *
 * @param leaf the XLS file to read
 * @return a {@link FileContent} holding the concatenated string cell values
 * @throws IOException declared by the signature; failures raised inside the
 *             try block are reported as {@link DocumentException} instead
 * @throws DocumentException wrapping any exception raised while opening or
 *             reading the workbook
 */
@Override
protected FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
    int cellNullCounter = 0;
    int rowNullCounter = 0;
    int sheetNullCounter = 0;
    try (BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
        HSSFWorkbook workbook = new HSSFWorkbook(new POIFSFileSystem(bis))) {
        // Clamp before narrowing so an out-of-range size cannot wrap to a negative int.
        int initialSize = (int) Math.max(0L, Math.min(leaf.getSize(), Integer.MAX_VALUE));
        LimitedContentWriter content = new LimitedContentWriter(initialSize, FileDocumentFactory.getMaxFileSize());
        for (int sheetNumber = 0; sheetNumber < workbook.getNumberOfSheets(); sheetNumber++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetNumber);
            if (sheet == null) {
                sheetNullCounter++;
                continue;
            }
            for (int rowNumber = sheet.getFirstRowNum(); rowNumber <= sheet.getLastRowNum(); rowNumber++) {
                HSSFRow row = sheet.getRow(rowNumber);
                if (row == null) {
                    rowNullCounter++;
                    continue;
                }
                // POI: getFirstCellNum() is -1 for a row without cells and
                // getLastCellNum() is the last cell index PLUS ONE (also -1 when
                // empty), so the upper bound is exclusive.
                int firstCell = Math.max(0, row.getFirstCellNum());
                int lastCellExclusive = row.getLastCellNum();
                for (int cellNumber = firstCell; cellNumber < lastCellExclusive; cellNumber++) {
                    HSSFCell cell = row.getCell(cellNumber);
                    if (cell == null) {
                        cellNullCounter++;
                    } else if (cell.getCellTypeEnum() == CellType.STRING) {
                        content.append(cell.getStringCellValue()).append(' ');
                    }
                }
            }
        }
        if (log.isDebug() && (cellNullCounter > 0 || rowNullCounter > 0 || sheetNullCounter > 0)) {
            log.debug("Read Excel content cell=null #:" + cellNullCounter + ", row=null #:" + rowNullCounter + ", sheet=null #:" + sheetNullCounter);
        }
        content.close();
        return new FileContent(content.toString());
    } catch (Exception ex) {
        throw new DocumentException("Can not read XLS Content. File=" + leaf.getName(), ex);
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) BufferedInputStream(java.io.BufferedInputStream) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) HSSFCell(org.apache.poi.hssf.usermodel.HSSFCell) HSSFRow(org.apache.poi.hssf.usermodel.HSSFRow) HSSFSheet(org.apache.poi.hssf.usermodel.HSSFSheet) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) IOException(java.io.IOException)

Example 10 with LimitedContentWriter

use of org.olat.core.util.io.LimitedContentWriter in project openolat by klemens.

the class PdfDocument method getPdfTextFromBuffer.

/**
 * Reads a buffered PDF text file and splits it into title and content.
 * <p>
 * The first chunk of up to 4096 characters is searched for the separator
 * {@code NBSP | NBSP}. When the separator is found at an index greater than
 * zero, the text before it becomes the title and the text after it starts
 * the content; otherwise the title is empty and the whole chunk is content.
 * The rest of the file is appended to the content unchanged.
 *
 * @param pdfTextFile the text file to read
 * @return the title and content read from the file; both are empty when the
 *         first read yields no characters
 * @throws IOException if reading the file or closing the reader/writer fails
 */
private FileContent getPdfTextFromBuffer(File pdfTextFile) throws IOException {
    if (log.isDebug()) {
        log.debug("readContent from text file start...");
    }
    try (BufferedReader reader = new BufferedReader(new FileReader(pdfTextFile));
        LimitedContentWriter text = new LimitedContentWriter(5000, FileDocumentFactory.getMaxFileSize())) {
        char[] buffer = new char[4096];
        int read = reader.read(buffer);
        if (read <= 0) {
            // nothing could be read: empty title, empty content
            return new FileContent("", text.toString());
        }

        // the title, if any, is looked up in the first chunk only
        String head = new String(buffer, 0, read);
        int separatorAt = head.indexOf("\u00A0|\u00A0");
        String title;
        if (separatorAt > 0) {
            title = head.substring(0, separatorAt);
            // skip the three separator characters
            text.append(head.substring(separatorAt + 3));
        } else {
            title = "";
            text.append(head);
        }

        // copy the rest of the file
        for (read = reader.read(buffer); read > 0; read = reader.read(buffer)) {
            text.write(buffer, 0, read);
        }
        return new FileContent(title, text.toString());
    }
}
Also used : LimitedContentWriter(org.olat.core.util.io.LimitedContentWriter) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) IOException(java.io.IOException)

Aggregations

IOException (java.io.IOException)18 LimitedContentWriter (org.olat.core.util.io.LimitedContentWriter)18 BufferedInputStream (java.io.BufferedInputStream)8 InputStream (java.io.InputStream)8 ZipEntry (java.util.zip.ZipEntry)6 ShieldInputStream (org.olat.core.util.io.ShieldInputStream)6 File (java.io.File)4 ArrayList (java.util.ArrayList)4 ZipFile (java.util.zip.ZipFile)4 POIFSFileSystem (org.apache.poi.poifs.filesystem.POIFSFileSystem)4 JavaIOItem (org.olat.core.util.vfs.JavaIOItem)4 BufferedReader (java.io.BufferedReader)2 FileReader (java.io.FileReader)2 InputStreamReader (java.io.InputStreamReader)2 ZipInputStream (java.util.zip.ZipInputStream)2 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)2 PDFTextStripper (org.apache.pdfbox.util.PDFTextStripper)2 HSSFCell (org.apache.poi.hssf.usermodel.HSSFCell)2 HSSFRow (org.apache.poi.hssf.usermodel.HSSFRow)2 HSSFSheet (org.apache.poi.hssf.usermodel.HSSFSheet)2