Use of org.apache.poi.hwpf.extractor.WordExtractor in the project carbon-apimgt by wso2.
The method getIndexedDocument of the class MSWordIndexer.
/**
 * Builds an {@link IndexDocument} for an MS Word file.
 *
 * <p>The bytes in {@code fileData.data} are first parsed as a Word 2003 (binary) document. If that
 * raises {@link OfficeXmlFileException}, the same bytes are parsed again as a Word 2007 (OOXML)
 * document. Any extraction failure, in either attempt, is logged and not rethrown; the document is
 * then still returned, with {@code null} as its text.
 *
 * @param fileData the file to index; its {@code data}, {@code path} and {@code mediaType} are read
 * @return an index document carrying the path, the extracted text (possibly {@code null}) and the
 *         "path" and media-type fields
 * @throws SolrException kept in the signature for compatibility with existing callers; this
 *         implementation does not raise it itself
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    // The reason for not throwing an exception is that since this is an indexer that runs in the background
    // throwing an exception might lead to adverse behaviors in the client side and might lead to
    // other files not being indexed
    String msg = "Failed to extract the document while indexing";
    String wordText = null;
    try {
        // Extract MSWord 2003 document files
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        WordExtractor msWord2003Extractor = new WordExtractor(fs);
        wordText = msWord2003Extractor.getText();
    } catch (OfficeXmlFileException e) {
        // if 2003 extraction failed, try with MSWord 2007 document files extractor.
        // The fallback needs its own handler: an exception thrown inside a catch clause is not
        // seen by the sibling catch clauses of the same try statement.
        try {
            XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data));
            XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
            wordText = msWord2007Extractor.getText();
        } catch (Exception fallbackFailure) {
            log.error(msg, fallbackFailure);
        }
    } catch (Exception e) {
        log.error(msg, e);
    }

    IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null);
    Map<String, List<String>> fields = new HashMap<String, List<String>>();
    fields.put("path", Collections.singletonList(fileData.path));
    if (fileData.mediaType != null) {
        fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList(fileData.mediaType));
    } else {
        // NOTE(review): the default used to be "application/pdf", which looks like a copy-paste
        // from a PDF indexer; confirm that no query relies on the old value.
        fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList("application/msword"));
    }
    indexDoc.setFields(fields);
    return indexDoc;
}
Aggregations