use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestExtractorFactory method testPackage.
@Test
public void testPackage() throws Exception {
// Excel
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(extractor.getText().length() > 200);
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor instanceof XSLFPowerPointExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
fail("TestExtractorFactory.testPackage() failed on " + txt);
} catch (UnsupportedFileFormatException e) {
// Good
} catch (Exception e) {
System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
throw e;
}
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestExtractorFactory method testInputStream.
@Test
public void testInputStream() throws Exception {
// Excel
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
assertTrue(extractor instanceof ExcelExtractor);
assertTrue(extractor.getText().length() > 200);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue(extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor);
assertTrue(extractor.getText().length() > 200);
// TODO Support OOXML-Strict, see bug #57699
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
// instanceof XSSFExcelExtractor
// );
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
// );
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue(extractor.getClass().getName(), extractor instanceof WordExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue(extractor.getClass().getName(), extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue(extractor.getClass().getName(), extractor instanceof Word6Extractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
assertTrue(extractor instanceof XWPFWordExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
assertTrue(extractor instanceof PowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
assertTrue(extractor instanceof XSLFPowerPointExtractor);
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
assertTrue(extractor instanceof VisioTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
assertTrue(extractor instanceof PublisherTextExtractor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
assertTrue(extractor instanceof OutlookTextExtactor);
assertTrue(extractor.getText().length() > 50);
extractor.close();
// Text
try {
FileInputStream stream = new FileInputStream(txt);
try {
ExtractorFactory.createExtractor(stream);
fail();
} finally {
IOUtils.closeQuietly(stream);
}
} catch (IllegalArgumentException e) {
// Good
}
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class TestExtractorFactory method testEmbeded.
/**
* Test embeded docs text extraction. For now, only
* does poifs embeded, but will do ooxml ones
* at some point.
*/
@Test
public void testEmbeded() throws Exception {
POIOLE2TextExtractor ext;
POITextExtractor[] embeds;
// No embedings
ext = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
// Excel
ext = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(6, embeds.length);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor)
numPpt++;
else if (embed instanceof ExcelExtractor)
numXls++;
else if (embed instanceof WordExtractor)
numWord++;
else if (embed instanceof OutlookTextExtactor)
numMsg++;
}
assertEquals(2, numPpt);
assertEquals(2, numXls);
assertEquals(2, numWord);
assertEquals(0, numMsg);
ext.close();
// Word
ext = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0;
numXls = 0;
numPpt = 0;
numMsg = 0;
assertEquals(4, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor)
numPpt++;
else if (embed instanceof ExcelExtractor)
numXls++;
else if (embed instanceof WordExtractor)
numWord++;
else if (embed instanceof OutlookTextExtactor)
numMsg++;
}
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Word which contains an OOXML file
ext = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0;
numXls = 0;
numPpt = 0;
numMsg = 0;
numWordX = 0;
assertEquals(3, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor)
numPpt++;
else if (embed instanceof ExcelExtractor)
numXls++;
else if (embed instanceof WordExtractor)
numWord++;
else if (embed instanceof OutlookTextExtactor)
numMsg++;
else if (embed instanceof XWPFWordExtractor)
numWordX++;
}
assertEquals(1, numPpt);
assertEquals(1, numXls);
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
ext.close();
// Outlook
ext = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0;
numXls = 0;
numPpt = 0;
numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor)
numPpt++;
else if (embed instanceof ExcelExtractor)
numXls++;
else if (embed instanceof WordExtractor)
numWord++;
else if (embed instanceof OutlookTextExtactor)
numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Outlook with another outlook file in it
ext = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0;
numXls = 0;
numPpt = 0;
numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor)
numPpt++;
else if (embed instanceof ExcelExtractor)
numXls++;
else if (embed instanceof WordExtractor)
numWord++;
else if (embed instanceof OutlookTextExtactor)
numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(0, numWord);
assertEquals(1, numMsg);
ext.close();
// TODO - PowerPoint
// TODO - Publisher
// TODO - Visio
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project Gargoyle by callakrsos.
the class DocxFileParser method docxFileContentParser.
public String docxFileContentParser(String fileName) {
try {
FileInputStream fs = new FileInputStream(new File(fileName));
OPCPackage d = OPCPackage.open(fs);
if (fileName.endsWith(".docx")) {
XWPFWordExtractor xw = new XWPFWordExtractor(d);
return xw.getText();
} else if (fileName.endsWith(".pptx")) {
XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);
return xp.getText();
} else if (fileName.endsWith(".xlsx")) {
XSSFExcelExtractor xe = new XSSFExcelExtractor(d);
xe.setFormulasNotResults(true);
xe.setIncludeSheetNames(true);
return xe.getText();
}
} catch (Exception e) {
System.out.println("# DocxFileParser Error :" + e.getMessage());
}
return "";
}
use of org.apache.poi.xwpf.extractor.XWPFWordExtractor in project poi by apache.
the class ExtractorFactory method createExtractor.
/**
* Tries to determine the actual type of file and produces a matching text-extractor for it.
*
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
* @throws IOException If an error occurs while reading the file
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
// If nothing was found, try some of the other OOXML-based core types
if (core.size() == 0) {
// Could it be an OOXML-Strict one?
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
}
if (core.size() == 0) {
// Could it be a visio one?
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
if (core.size() == 1)
return new XDGFVisioExtractor(pkg);
}
// Should just be a single core document, complain if not
if (core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
// Grab the core document part, and try to identify from that
final PackagePart corePart = pkg.getPart(core.getRelationship(0));
final String contentType = corePart.getContentType();
// Is it XSSF?
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
if (getPreferEventExtractor()) {
return new XSSFEventBasedExcelExtractor(pkg);
}
return new XSSFExcelExtractor(pkg);
}
}
// Is it XWPF?
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XWPFWordExtractor(pkg);
}
}
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(pkg);
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
}
// How about xlsb?
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
if (rel.getContentType().equals(contentType)) {
return new XSSFBEventBasedExcelExtractor(pkg);
}
}
throw new IllegalArgumentException("No supported documents found in the OOXML package (found " + contentType + ")");
} catch (IOException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (OpenXML4JException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (XmlException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
} catch (RuntimeException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
}
}
Aggregations