Use of org.olat.search.service.document.file.FileContent in project OpenOLAT by OpenOLAT.
Class PdfBoxExtractor, method extractTextFromPdf.
/**
 * Reads the given leaf as a PDF and returns its title and text as a {@link FileContent}.
 *
 * <p>The leaf's input stream is buffered and loaded with {@code PDDocument.load}. If the
 * document reports itself as encrypted, a decrypt with the empty password is attempted;
 * when that fails, a warning is logged and a {@code FileContent} is returned whose title
 * and content are both the leaf's name. Otherwise the title comes from
 * {@code getTitle(document)} and the text is written by a {@code PDFTextStripper} into a
 * {@code LimitedContentWriter} constructed with {@code FileDocumentFactory.getMaxFileSize()}.
 *
 * @param leaf the file to read; its {@code getInputStream()} and {@code getName()} are used
 * @return the extracted title and text, or the leaf name for both when decryption fails
 * @throws IOException if loading, text stripping or closing fails
 * @throws DocumentAccessException declared in the signature; nothing in this body throws it
 *         directly (helpers such as {@code getTitle} may; not visible here)
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    BufferedInputStream bis = null;
    try {
        bis = new BufferedInputStream(leaf.getInputStream());
        document = PDDocument.load(bis);
        if (document.isEncrypted()) {
            try {
                // Many "encrypted" PDFs only restrict permissions and open with an empty password.
                document.decrypt("");
            } catch (Exception e) {
                // Best effort: the content is unreadable, so fall back to the file name only.
                log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                writer.append(leaf.getName());
                writer.close();
                return new FileContent(leaf.getName(), writer.toString());
            }
        }
        String title = getTitle(document);
        if (log.isDebug()) {
            log.debug("readContent PDDocument loaded");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
        stripper.writeText(document, writer);
        writer.close();
        return new FileContent(title, writer.toString());
    } finally {
        // Nested finally: the stream must be closed even if closing the document throws,
        // otherwise a failing document.close() would leak the underlying input stream.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
    }
}
Use of org.olat.search.service.document.file.FileContent in project openolat by klemens.
Class PdfBoxExtractor, method extract.
/**
 * Extracts the PDF text of the given leaf via {@code extractTextFromPdf} and hands the
 * result, together with the buffer file, to {@code storePdfTextInBuffer}.
 *
 * @param document the PDF leaf to read
 * @param bufferFile the file passed on to {@code storePdfTextInBuffer}
 * @throws IOException propagated from the extraction or store step
 * @throws DocumentAccessException propagated from the extraction step
 */
@Override
public void extract(VFSLeaf document, File bufferFile) throws IOException, DocumentAccessException {
    // Extraction is evaluated first, then its result is stored; same order as a two-step form.
    storePdfTextInBuffer(extractTextFromPdf(document), bufferFile);
}
Use of org.olat.search.service.document.file.FileContent in project openolat by klemens.
Class PdfBoxExtractor, method extractTextFromPdf.
/**
 * Reads the given leaf as a PDF and returns its title and text as a {@link FileContent}.
 *
 * <p>The leaf's input stream is buffered and loaded with {@code PDDocument.load}. If the
 * document reports itself as encrypted, a decrypt with the empty password is attempted;
 * when that fails, a warning is logged and a {@code FileContent} is returned whose title
 * and content are both the leaf's name. Otherwise the title comes from
 * {@code getTitle(document)} and the text is written by a {@code PDFTextStripper} into a
 * {@code LimitedContentWriter} constructed with {@code FileDocumentFactory.getMaxFileSize()}.
 *
 * @param leaf the file to read; its {@code getInputStream()} and {@code getName()} are used
 * @return the extracted title and text, or the leaf name for both when decryption fails
 * @throws IOException if loading, text stripping or closing fails
 * @throws DocumentAccessException declared in the signature; nothing in this body throws it
 *         directly (helpers such as {@code getTitle} may; not visible here)
 */
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    BufferedInputStream bis = null;
    try {
        bis = new BufferedInputStream(leaf.getInputStream());
        document = PDDocument.load(bis);
        if (document.isEncrypted()) {
            try {
                // Many "encrypted" PDFs only restrict permissions and open with an empty password.
                document.decrypt("");
            } catch (Exception e) {
                // Best effort: the content is unreadable, so fall back to the file name only.
                log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
                LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
                writer.append(leaf.getName());
                writer.close();
                return new FileContent(leaf.getName(), writer.toString());
            }
        }
        String title = getTitle(document);
        if (log.isDebug()) {
            log.debug("readContent PDDocument loaded");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
        stripper.writeText(document, writer);
        writer.close();
        return new FileContent(title, writer.toString());
    } finally {
        // Nested finally: the stream must be closed even if closing the document throws,
        // otherwise a failing document.close() would leak the underlying input stream.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            if (bis != null) {
                bis.close();
            }
        }
    }
}
Use of org.olat.search.service.document.file.FileContent in project OpenOLAT by OpenOLAT.
Class PdfBoxExtractor, method extract.
/**
 * Extracts the PDF text of the given leaf via {@code extractTextFromPdf} and hands the
 * result, together with the buffer file, to {@code storePdfTextInBuffer}.
 *
 * @param document the PDF leaf to read
 * @param bufferFile the file passed on to {@code storePdfTextInBuffer}
 * @throws IOException propagated from the extraction or store step
 * @throws DocumentAccessException propagated from the extraction step
 */
@Override
public void extract(VFSLeaf document, File bufferFile) throws IOException, DocumentAccessException {
    // Extraction is evaluated first, then its result is stored; same order as a two-step form.
    storePdfTextInBuffer(extractTextFromPdf(document), bufferFile);
}
Aggregations