use of org.apache.poi.hssf.extractor.ExcelExtractor in project poi by apache.
the class TestExtractorFactory method testFile.
@Test
public void testFile() throws Exception {
// Excel
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), xlsExtractor instanceof ExcelExtractor);
assertTrue(xlsExtractor.getText().length() > 200);
xlsExtractor.close();
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsb);
assertContains(extractor.getText(), "test");
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertContains(extractor.getText(), "test");
extractor.close();
// TODO Support OOXML-Strict, see bug #57699
try {
/*extractor =*/
ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
}
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor
// instanceof XSSFExcelExtractor
// );
// extractor.close();
//
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor.getText().contains("test")
// );
// extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(extractor instanceof WordExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertContains(extractor.getText(), "Test");
extractor.close();
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(extractor instanceof PowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(extractor instanceof XSLFPowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(extractor instanceof VisioTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(extractor instanceof PublisherTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(extractor instanceof OutlookTextExtactor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(txt);
fail();
} catch (IllegalArgumentException e) {
// Good
}
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project carbon-apimgt by wso2.
the class MSExcelIndexer method getIndexedDocument.
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
try {
String excelText = null;
try {
// Extract Excel 2003 (.xsl) document files
ExcelExtractor extractor = getExcelExtractor(fileData);
excelText = extractor.getText();
} catch (OfficeXmlFileException e) {
// if 2003 Excel (.xsl) extraction failed, try with Excel 2007 (.xslx) document files extractor
XSSFExcelExtractor xssfExcelExtractor = getXssfExcelExtractor(fileData);
excelText = xssfExcelExtractor.getText();
} catch (Exception e) {
String msg = "Failed to extract the document";
log.error(msg, e);
}
IndexDocument indexDoc = new IndexDocument(fileData.path, excelText, null);
Map<String, List<String>> fields = new HashMap<String, List<String>>();
fields.put("path", Collections.singletonList(fileData.path));
if (fileData.mediaType != null) {
fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList(fileData.mediaType));
} else {
fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList("application/vnd.ms-excel"));
}
indexDoc.setFields(fields);
return indexDoc;
} catch (IOException e) {
String msg = "Failed to write to the index";
log.error(msg, e);
throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
}
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project carbon-apimgt by wso2.
the class MSExcelIndexerTest method setup.
@Before
public void setup() {
excelExtractor = Mockito.mock(ExcelExtractor.class);
xssfExtractor = Mockito.mock(XSSFExcelExtractor.class);
msExcelIndexer = new MSExcelIndexerWrapper(xssfExtractor, excelExtractor);
file2Index = new AsyncIndexer.File2Index("".getBytes(), "", "", -1234, "");
}
Aggregations