Search in sources :

Example 31 with XWPFDocument

use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.

the class TestXWPFBugs method bug53475NoCSPName.

/**
 * Reads a Word document that is encrypted with non-standard encryption
 * options and has no cspname section. See bug 53475.
 *
 * <p>The test checks the encryption header (key size 128, AES-128, SHA-1),
 * verifies the password {@code "solrcell"}, and then extracts the text from
 * the decrypted package.
 *
 * @throws Exception if the sample file cannot be opened, decrypted or parsed
 */
@Test
public void bug53475NoCSPName() throws Exception {
    File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
    NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
    // Close the file system in a finally block so a failing assertion
    // does not leave the sample file open.
    try {
        // Check the encryption details
        EncryptionInfo info = new EncryptionInfo(filesystem);
        assertEquals(128, info.getHeader().getKeySize());
        assertEquals(CipherAlgorithm.aes128, info.getHeader().getCipherAlgorithm());
        assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());
        // Check it can be decoded
        Decryptor d = Decryptor.getInstance(info);
        assertTrue("Unable to process: document is encrypted", d.verifyPassword("solrcell"));
        // Check we can read the word document in that
        InputStream dataStream = d.getDataStream(filesystem);
        OPCPackage opc = OPCPackage.open(dataStream);
        XWPFDocument doc = new XWPFDocument(opc);
        XWPFWordExtractor ex = new XWPFWordExtractor(doc);
        try {
            String text = ex.getText();
            assertNotNull(text);
            assertEquals("This is password protected Word document.", text.trim());
        } finally {
            ex.close();
        }
    } finally {
        filesystem.close();
    }
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) File(java.io.File) ZipFile(java.util.zip.ZipFile) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) Test(org.junit.Test)

Example 32 with XWPFDocument

use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.

the class TestExternalEntities method testFile.

/**
 * Gets the text out of the simple file {@code ExternalEntityInText.docx}
 * and checks that nothing from the POI web site appears after the colon.
 *
 * @throws IOException if reading the sample document fails
 */
public void testFile() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ExternalEntityInText.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    // Close the extractor even when an assertion fails.
    try {
        String text = extractor.getText();
        assertTrue(text.length() > 0);
        // Check contents, they should not contain the text from POI web site after colon!
        assertEquals("Here should not be the POI web site: \"\"", text.trim());
    } finally {
        extractor.close();
    }
}
Also used : XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument)

Example 33 with XWPFDocument

use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.

the class TestXWPFWordExtractor method testSimpleControlContent.

/**
 * Test for basic extraction of SDT content.
 *
 * <p>First checks that every expected snippet occurs in the lower-cased text
 * of {@code Bug54849.docx}. Then checks that selected snippets occur exactly
 * once in the lower-cased text of {@code Bug54771a.docx}, and that
 * {@code "test\n"} occurs exactly twice.
 *
 * @throws IOException if reading a sample document fails
 */
public void testSimpleControlContent() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
    String[] targs = new String[] { "header_rich_text", "rich_text", "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table", "plain_text_no_newlines", "plain_text_with_newlines1\nplain_text_with_newlines2\n", "watermelon\n", "dirt\n", "4/16/2013\n", "rich_text_in_cell", "abc", "rich_text_in_paragraph_in_cell", "footer_rich_text", "footnote_sdt", "endnote_sdt" };
    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
    // Close each extractor in a finally block so a failing assertion does not leak it.
    try {
        String s = ex.getText().toLowerCase(Locale.ROOT);
        int hits = 0;
        for (String targ : targs) {
            boolean hit = s.contains(targ);
            if (hit) {
                hits++;
            }
            assertTrue("controlled content loading-" + targ, hit);
        }
        assertEquals("controlled content loading hit count", targs.length, hits);
    } finally {
        ex.close();
    }

    doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
    targs = new String[] { "bb", "test subtitle\n", "test user\n" };
    ex = new XWPFWordExtractor(doc);
    try {
        String s = ex.getText().toLowerCase(Locale.ROOT);
        //This ensures that there is only one copy.
        for (String targ : targs) {
            assertEquals("controlled content loading-" + targ, 1, countLiteralOccurrences(s, targ));
        }
        //"test\n" appears twice: once as the "title" and once in the text.
        //This also happens when you save this document as text from MSWord.
        assertEquals("test<N>", 2, countLiteralOccurrences(s, "test\n"));
    } finally {
        ex.close();
    }
}

/**
 * Counts the non-overlapping occurrences of {@code needle} in {@code haystack}.
 * The needle is quoted, so it is matched literally rather than as a regex.
 */
private static int countLiteralOccurrences(String haystack, String needle) {
    Matcher m = Pattern.compile(Pattern.quote(needle)).matcher(haystack);
    int count = 0;
    while (m.find()) {
        count++;
    }
    return count;
}
Also used : Matcher(java.util.regex.Matcher) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument)

Example 34 with XWPFDocument

use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.

the class TestXWPFWordExtractor method testGetComplexText.

/**
 * Tests getting the text out of the complex file {@code IllustrativeCases.docx}.
 *
 * <p>Checks the start, a passage containing the euro sign, and the end of the
 * extracted text, and counts the newlines as a proxy for the number of
 * paragraphs.
 *
 * @throws IOException if reading the sample document fails
 */
public void testGetComplexText() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("IllustrativeCases.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    // Close the extractor even when an assertion fails.
    try {
        String text = extractor.getText();
        assertTrue(text.length() > 0);
        char euro = '€';
        // Check contents
        assertStartsWith(text, "  \n(V) ILLUSTRATIVE CASES\n\n");
        assertContains(text, "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n");
        assertEndsWith(text, "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n");
        // Check number of paragraphs by counting number of newlines
        int numberOfParagraphs = StringUtil.countMatches(text, '\n');
        assertEquals(134, numberOfParagraphs);
    } finally {
        extractor.close();
    }
}
Also used : XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument)

Example 35 with XWPFDocument

use of org.apache.poi.xwpf.usermodel.XWPFDocument in project poi by apache.

the class TestXWPFWordExtractor method testCheckboxes.

/**
 * Checks the text extracted from {@code checkboxes.docx}: the expected text
 * shows unchecked boxes as {@code |_|} and checked boxes as {@code |X|}, in
 * plain paragraphs, in a textbox, in a table and in sequence.
 *
 * @throws IOException if reading the sample document fails
 */
public void testCheckboxes() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("checkboxes.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    // Close the extractor even when the assertion fails.
    try {
        assertEquals("This is a small test for checkboxes \nunchecked: |_| \n" + "Or checked: |X|\n\n\n\n\n" + "Test a checkbox within a textbox: |_| -> |X|\n\n\n" + "In Table:\n|_|\t|X|\n\n\n" + "In Sequence:\n|X||_||X|\n", extractor.getText());
    } finally {
        extractor.close();
    }
}
Also used : XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument)

Aggregations

XWPFDocument (org.apache.poi.xwpf.usermodel.XWPFDocument): 51, Test (org.junit.Test): 15, FileOutputStream (java.io.FileOutputStream): 11, File (java.io.File): 9, XWPFParagraph (org.apache.poi.xwpf.usermodel.XWPFParagraph): 9, XWPFRun (org.apache.poi.xwpf.usermodel.XWPFRun): 9, InputStream (java.io.InputStream): 6, OutputStream (java.io.OutputStream): 6, FileInputStream (java.io.FileInputStream): 4, XWPFTable (org.apache.poi.xwpf.usermodel.XWPFTable): 4, OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage): 3, NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem): 3, XMLSlideShow (org.apache.poi.xslf.usermodel.XMLSlideShow): 3, XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor): 3, XWPFFooter (org.apache.poi.xwpf.usermodel.XWPFFooter): 3, XWPFHeader (org.apache.poi.xwpf.usermodel.XWPFHeader): 3, ByteArrayInputStream (java.io.ByteArrayInputStream): 2, ZipFile (java.util.zip.ZipFile): 2, HSLFSlideShow (org.apache.poi.hslf.usermodel.HSLFSlideShow): 2, HSSFWorkbook (org.apache.poi.hssf.usermodel.HSSFWorkbook): 2