use of org.apache.poi.hssf.extractor.ExcelExtractor in project poi by apache.
the class TestHPSFPropertiesExtractor method testConstructors.
/**
 * Verifies that {@link HPSFPropertiesExtractor} produces the same text when it is
 * constructed from a {@link POIFSFileSystem}, from an {@link HSSFWorkbook} and from an
 * {@link ExcelExtractor}, all backed by the sample file {@code TestUnicode.xls}.
 *
 * @throws IOException if extracting the text or closing a resource fails
 */
public void testConstructors() throws IOException {
    POIFSFileSystem fs;
    HSSFWorkbook wb;
    try {
        fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
        wb = new HSSFWorkbook(fs);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    final String fsText;
    final String hwText;
    final String eeText;
    // The workbook is released on every path, including when one of the
    // extractions below throws before the last one is reached.
    try {
        ExcelExtractor excelExt = new ExcelExtractor(wb);
        fsText = readPropertiesText(new HPSFPropertiesExtractor(fs));
        hwText = readPropertiesText(new HPSFPropertiesExtractor(wb));
        eeText = readPropertiesText(new HPSFPropertiesExtractor(excelExt));
    } finally {
        wb.close();
    }

    assertEquals(fsText, hwText);
    assertEquals(fsText, eeText);
    assertContains(fsText, "AUTHOR = marshall");
    assertContains(fsText, "TITLE = Titel: Äh");
}

/**
 * Returns the text of the given extractor and then closes the extractor.
 *
 * @param extractor the extractor to read from and close
 * @return the value of {@code extractor.getText()}
 * @throws IOException if closing the extractor fails
 */
private static String readPropertiesText(HPSFPropertiesExtractor extractor) throws IOException {
    // Don't close re-used test resources!
    extractor.setFilesystem(null);
    try {
        return extractor.getText();
    } finally {
        extractor.close();
    }
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project poi by apache.
the class TestCryptoAPI method validateContent.
/**
 * Opens the sample workbook {@code wbFile} with {@code password} set as the current user
 * password and asserts that its extracted text contains {@code textExpected}; then sets
 * the password to {@code "bla"}, writes the workbook out, reads it back and asserts the
 * same on the round-tripped copy.
 *
 * <p>The current user password is left set to {@code "bla"} when this method returns.
 *
 * @param wbFile       name of the sample workbook to open
 * @param password     password set before opening the sample workbook
 * @param textExpected text that must be contained in both extraction results
 * @throws IOException if reading, writing or closing a workbook fails
 */
private void validateContent(String wbFile, String password, String textExpected) throws IOException {
    Biff8EncryptionKey.setCurrentUserPassword(password);
    HSSFWorkbook wb = ssTests.openSampleWorkbook(wbFile);
    // Every resource is closed in a finally block so that a failed assertion or a
    // failed extraction does not leave workbooks or extractors open.
    try {
        ExcelExtractor ee1 = new ExcelExtractor(wb);
        try {
            assertContains(ee1.getText(), textExpected);

            Biff8EncryptionKey.setCurrentUserPassword("bla");
            HSSFWorkbook wbBla = ssTests.writeOutAndReadBack(wb);
            try {
                ExcelExtractor ee2 = new ExcelExtractor(wbBla);
                try {
                    assertContains(ee2.getText(), textExpected);
                } finally {
                    ee2.close();
                }
            } finally {
                wbBla.close();
            }
        } finally {
            ee1.close();
        }
    } finally {
        wb.close();
    }
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project Gargoyle by callakrsos.
the class DocFileParser method DocFileContentParser.
/**
 * Extracts the text of a {@code .doc}, {@code .xls} or {@code .ppt} file.
 *
 * <p>The extension check is case-sensitive. For {@code .xls} files the extractor is
 * configured with {@code setFormulasNotResults(true)} and
 * {@code setIncludeSheetNames(true)}.
 *
 * @param fileName path of the file to read
 * @return the extracted text, or an empty string when the extension is not one of the
 *         three handled ones or when any exception occurs while reading
 */
public String DocFileContentParser(String fileName) {
    FileInputStream in = null;
    try {
        in = new FileInputStream(fileName);
        POIFSFileSystem fs = new POIFSFileSystem(in);
        if (fileName.endsWith(".doc")) {
            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);
            return we.getText();
        } else if (fileName.endsWith(".xls")) {
            ExcelExtractor ex = new ExcelExtractor(fs);
            ex.setFormulasNotResults(true);
            ex.setIncludeSheetNames(true);
            return ex.getText();
        } else if (fileName.endsWith(".ppt")) {
            PowerPointExtractor extractor = new PowerPointExtractor(fs);
            return extractor.getText();
        }
    } catch (Exception e) {
        // Best effort: the failure is reported (with its cause) and "" is returned.
        LOGGER.debug("document file cant be indexed", e);
    } finally {
        // Release the file handle regardless of how far the parsing got.
        if (in != null) {
            try {
                in.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return "";
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project carbon-apimgt by wso2.
the class DocumentIndexer method fetchDocumentContent.
/**
 * Reads the content of a document and returns it as a string.
 *
 * <p>For a document whose source type is {@code FILE}, the content resource is looked up
 * from the artifact's file path and converted to text according to the file extension
 * (PDF, DOC, DOCX, XLS, XLSX, PPT, PPTX, or the plain-text extensions TXT / WSDL / XML).
 * For a document whose source type is {@code INLINE}, the content is read from the
 * resource under the inline-content directory next to the document, if that resource
 * exists.
 *
 * @param registry         registry used to look up the artifact and the content resources
 * @param documentResource resource of the document whose content is wanted
 * @return the document content, or {@code null} when the source type is neither
 *         {@code FILE} nor {@code INLINE}, the file extension is not handled, or the
 *         inline content resource does not exist
 * @throws RegistryException      declared for the registry calls made here
 * @throws IOException            if reading or parsing the content stream fails
 * @throws APIManagementException declared for the artifact-manager lookup
 */
private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);
    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        String path = documentArtifact.getAttribute(APIConstants.DOC_FILE_PATH);
        // Keep only the part of the path starting at the API-management registry location.
        int indexOfApimgt = path.indexOf(APIConstants.APIMGT_REGISTRY_LOCATION);
        String filepath = path.substring(indexOfApimgt);
        Resource contentResource = registry.get(filepath);
        // The file name is what follows "<DOCUMENT_FILE_DIR>" plus one separator character.
        int indexOfFiles = filepath.indexOf(APIConstants.DOCUMENT_FILE_DIR) + APIConstants.DOCUMENT_FILE_DIR.length() + 1;
        String fileName = filepath.substring(indexOfFiles);
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
                case APIConstants.PDF_EXTENSION:
                {
                    PDFParser pdfParser = new PDFParser(new RandomAccessBufferedFileInputStream(inputStream));
                    pdfParser.parse();
                    COSDocument cosDocument = pdfParser.getDocument();
                    PDDocument pdDocument = new PDDocument(cosDocument);
                    try {
                        PDFTextStripper stripper = new PDFTextStripper();
                        contentString = stripper.getText(pdDocument);
                    } finally {
                        // Close the parsed document so it does not stay open after
                        // the text has been extracted.
                        pdDocument.close();
                    }
                    break;
                }
                case APIConstants.DOC_EXTENSION:
                {
                    POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                    WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                    contentString = msWord2003Extractor.getText();
                    break;
                }
                case APIConstants.DOCX_EXTENSION:
                    XWPFDocument doc = new XWPFDocument(inputStream);
                    XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                    contentString = msWord2007Extractor.getText();
                    break;
                case APIConstants.XLS_EXTENSION:
                {
                    POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                    ExcelExtractor extractor = new ExcelExtractor(pfs);
                    contentString = extractor.getText();
                    break;
                }
                case APIConstants.XLSX_EXTENSION:
                    XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                    XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                    contentString = xssfExcelExtractor.getText();
                    break;
                case APIConstants.PPT_EXTENSION:
                {
                    POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                    PowerPointExtractor extractor = new PowerPointExtractor(fs);
                    contentString = extractor.getText();
                    break;
                }
                case APIConstants.PPTX_EXTENSION:
                    XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                    XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                    contentString = xslfPowerPointExtractor.getText();
                    break;
                case APIConstants.TXT_EXTENSION:
                case APIConstants.WSDL_EXTENSION:
                case APIConstants.XML_DOC_EXTENSION:
                    contentString = readLinesWithoutSeparators(inputStream);
                    break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        String fileName = ((ResourceImpl) documentResource).getName();
        String pathToDocFile = documentResource.getPath();
        String pathToContent = pathToDocFile.substring(0, pathToDocFile.lastIndexOf(fileName)) + APIConstants.INLINE_DOCUMENT_CONTENT_DIR + RegistryConstants.PATH_SEPARATOR + fileName;
        if (registry.resourceExists(pathToContent)) {
            Resource contentResource = registry.get(pathToContent);
            InputStream instream = null;
            try {
                instream = contentResource.getContentStream();
                contentString = readLinesWithoutSeparators(instream);
            } finally {
                IOUtils.closeQuietly(instream);
            }
        }
    }
    return contentString;
}

/**
 * Reads the stream line by line and concatenates the lines.
 *
 * <p>Line terminators are dropped and nothing is inserted between lines, and the bytes
 * are decoded with the platform default charset. The stream is not closed here; the
 * caller owns it.
 * NOTE(review): joining lines without a separator fuses the last word of one line with
 * the first word of the next — confirm this is intended for the consumers of the result.
 *
 * @param stream the stream to read
 * @return all lines of the stream joined without separators
 * @throws IOException if reading from the stream fails
 */
private static String readLinesWithoutSeparators(InputStream stream) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    StringBuilder contentBuilder = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        contentBuilder.append(line);
    }
    return contentBuilder.toString();
}
use of org.apache.poi.hssf.extractor.ExcelExtractor in project poi by apache.
the class OLE2ExtractorFactory method getEmbededDocsTextExtractors.
/**
 * Creates one text extractor per document embedded in the file that the
 * given extractor was opened on.
 *
 * @param ext the extractor whose underlying file is searched for embedded documents
 * @return one open {@link POITextExtractor} for each embedded document for which an
 *         extractor could be created; an empty array if no embedded document was found
 * @throws IllegalStateException    if {@code ext} has no root directory
 * @throws IllegalArgumentException if the Scratchpad lookup of embedded resources fails
 * @throws IOException              declared for the extractor creation calls
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    // Embedded documents that are held as POIFS directories
    List<Entry> embeddedDirs = new ArrayList<Entry>();
    // Embedded documents that are only available as raw streams
    List<InputStream> embeddedStreams = new ArrayList<InputStream>();

    if (!(ext instanceof ExcelExtractor)) {
        // Anything other than Excel is delegated to Scratchpad via reflection
        Class<?> scratchpad = getScratchpadClass();
        try {
            Method identify = scratchpad.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
            identify.invoke(null, ext, embeddedDirs, embeddedStreams);
        } catch (Exception cause) {
            throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", cause);
        }
    } else {
        // For Excel, the embedded documents are the "MBD..." entries under the root
        for (Iterator<Entry> entries = root.getEntries(); entries.hasNext();) {
            Entry candidate = entries.next();
            if (candidate.getName().startsWith("MBD")) {
                embeddedDirs.add(candidate);
            }
        }
    }

    if (embeddedDirs.isEmpty() && embeddedStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    List<POITextExtractor> extractors = new ArrayList<POITextExtractor>();
    for (Entry embeddedDir : embeddedDirs) {
        extractors.add(createExtractor((DirectoryNode) embeddedDir));
    }
    for (InputStream embeddedStream : embeddedStreams) {
        try {
            extractors.add(createExtractor(embeddedStream));
        } catch (Exception failure) {
            // Skipped on purpose: either a format that is not supported yet
            // (IllegalArgumentException) or an invalid one; only a warning is logged
            LOGGER.log(POILogger.WARN, failure);
        }
    }
    return extractors.toArray(new POITextExtractor[extractors.size()]);
}
Aggregations