Use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.
Class TestXWPFWordExtractor, method testInsertedDeletedText.
/**
 * Extracts the text of the {@code delins.docx} sample and checks that it
 * contains both "pendant worn" and "extremely well".
 * NOTE(review): judging by the test name these phrases presumably come from
 * tracked insertions/deletions in the sample - confirm against the file.
 *
 * @throws IOException if the sample cannot be opened or the extractor cannot be closed
 */
public void testInsertedDeletedText() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("delins.docx");
    // try-with-resources closes the extractor even when an assertion fails.
    try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
        String text = extractor.getText();
        assertContains(text, "pendant worn");
        assertContains(text, "extremely well");
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.
Class TestXWPFWordExtractor, method testDOCMFiles.
/**
 * Test that we can open and process .docm
 * (macro enabled) docx files (bug #45690).
 * The extracted text of {@code 45690.docm} must contain "2004", "2008"
 * and "(120 ".
 *
 * @throws IOException if the sample cannot be opened or the extractor cannot be closed
 */
public void testDOCMFiles() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("45690.docm");
    // try-with-resources closes the extractor even when an assertion fails.
    try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
        String text = extractor.getText();
        assertContains(text, "2004");
        assertContains(text, "2008");
        assertContains(text, "(120 ");
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.
Class TestXWPFWordExtractor, method testHeadersFooters.
/**
 * Checks that header and footer text is included in the extracted text,
 * first for {@code ThreeColHeadFoot.docx} and then for
 * {@code DiffFirstPageHeadFoot.docx}, where multiple headers and multiple
 * footers are expected.
 *
 * @throws IOException if a sample cannot be opened or an extractor cannot be closed
 */
public void testHeadersFooters() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ThreeColHeadFoot.docx");
    // try-with-resources closes the extractor even when the assertion fails.
    try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
        assertEquals(
                "First header column!\tMid header\tRight header!\n"
                        + "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n"
                        + "\n"
                        + "HEADING TEXT\n"
                        + "\n"
                        + "More on page one\n"
                        + "\n\n"
                        + "End of page 1\n\n\n"
                        + "This is page two. It also has a three column heading, and a three column footer.\n"
                        + "Footer Left\tFooter Middle\tFooter Right\n",
                extractor.getText());
    }

    // Now another file, expect multiple headers
    // and multiple footers
    doc = XWPFTestDataSamples.openSampleDocument("DiffFirstPageHeadFoot.docx");
    try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
        // The first extraction is discarded; the assertion below checks the
        // result of the second call on the same extractor.
        extractor.getText();
        assertEquals(
                "I am the header on the first page, and I" + '’' + "m nice and simple\n"
                        + "First header column!\tMid header\tRight header!\n"
                        + "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n"
                        + "\n"
                        + "HEADING TEXT\n"
                        + "\n"
                        + "More on page one\n"
                        + "\n\n"
                        + "End of page 1\n\n\n"
                        + "This is page two. It also has a three column heading, and a three column footer.\n"
                        + "The footer of the first page\n"
                        + "Footer Left\tFooter Middle\tFooter Right\n",
                extractor.getText());
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in project tika by apache.
Class OOXMLExtractorFactory, method parse.
/**
 * Parses an OOXML document read from {@code stream}: locates or opens its
 * {@link OPCPackage}, detects the media type, picks a POI text extractor,
 * wraps it in the matching Tika decorator, then extracts metadata followed
 * by XHTML content into {@code baseHandler}.
 *
 * <p>If the detected type is {@code null} or listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@code EmptyParser.INSTANCE} and nothing else is done.
 *
 * @param stream      the document stream; a {@link TikaInputStream} may supply an
 *                    already-open container or a backing file
 * @param baseHandler receives the XHTML output
 * @param metadata    receives the content type and the extracted metadata
 * @param context     supplies the {@link Locale} (default: {@code Locale.getDefault()})
 *                    and the {@code OfficeParserConfig}, which is dereferenced
 *                    without a null check
 * @throws IOException   as declared; may be raised while opening the package or extracting
 * @throws SAXException  as declared; may be raised by the content handler pipeline
 * @throws TikaException if a user-model extractor has no document, or wrapping any
 *                       {@code IllegalArgumentException}, {@code OpenXML4JException}
 *                       or {@code XmlException} raised during processing
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Locate or open the OPCPackage: reuse an open container, else open the
        // backing file read-only, else read from the (close-shielded) stream.
        final TikaInputStream tikaStream = TikaInputStream.cast(stream);
        final Object openContainer = (tikaStream == null) ? null : tikaStream.getOpenContainer();
        final OPCPackage opcPackage;
        if (openContainer instanceof OPCPackage) {
            opcPackage = (OPCPackage) openContainer;
        } else if (tikaStream != null && tikaStream.hasFile()) {
            opcPackage = OPCPackage.open(tikaStream.getFile().getPath(), PackageAccess.READ);
            // Register the package on the stream as its open container.
            tikaStream.setOpenContainer(opcPackage);
        } else {
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Detect the type, and bail out to the Empty Parser if we do not handle it.
        final MediaType mediaType = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        final boolean supported = mediaType != null
                && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(mediaType);
        if (!supported) {
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());

        // The config has already been set by OOXMLParser's call to configure(),
        // so it is relied upon to be non-null here.
        final OfficeParserConfig officeConfig = context.get(OfficeParserConfig.class);

        // Pick the POI text extractor: SAX docx first, then SAX pptx, then the
        // generic factory as the fallback.
        POIXMLTextExtractor textExtractor =
                officeConfig.getUseSAXDocxExtractor() ? trySXWPF(opcPackage) : null;
        if (textExtractor == null && officeConfig.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(opcPackage);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Choose the Tika decorator. Event-based extractors are matched by their
        // own type first; the remaining branches dispatch on the document type.
        final POIXMLDocument poiDocument = textExtractor.getDocument();
        final OOXMLExtractor decorator;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorator = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorator = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorator = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorator = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (poiDocument == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (poiDocument instanceof XMLSlideShow) {
            decorator = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (poiDocument instanceof XWPFDocument) {
            decorator = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorator = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Get the bulk of the metadata first, so that it's accessible during
        // parsing if desired by the client (see TIKA-1109)
        decorator.getMetadataExtractor().extract(metadata);
        // Then the text, along with any in-document metadata.
        decorator.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        final String message = e.getMessage();
        if (message != null && message.startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is covered here as a subtype of OpenXML4JException.
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in project Gargoyle by callakrsos.
Class MSWordTest, method toPdf.
/**
 * Converts a local .docx file to {@code Algorism.pdf} in the working
 * directory, prints a confirmation line and opens the result through
 * {@code FileUtil.openFile}.
 *
 * @throws FileNotFoundException if the input or output file cannot be opened
 * @throws IOException if reading the document or writing the PDF fails
 */
@Test
public final void toPdf() throws FileNotFoundException, IOException {
    File file = new File("C:\\Users\\KYJ\\Desktop\\학습\\Algorism.docx");
    File outFile = new File("Algorism.pdf");
    // The input stream is now closed on every path; previously it was never closed.
    try (FileInputStream in = new FileInputStream(file)) {
        // Load the document before creating the output file, so a bad input
        // does not leave an empty PDF behind.
        XWPFDocument document = new XWPFDocument(in);
        try (OutputStream out = new FileOutputStream(outFile)) {
            PdfOptions options = PdfOptions.getDefault();
            PdfConverter.getInstance().convert(document, out, options);
        }
    }
    System.out.println("Sucess");
    FileUtil.openFile(outFile);
}
Aggregations