Use of org.apache.poi.xwpf.usermodel.XWPFDocument in the Apache POI project.
From class TestXWPFBugs, method bug53475NoCSPName.
/**
 * Verifies that a Word document encrypted with non-standard encryption
 * options, and without a cspname section, can be decrypted and read.
 * See bug 53475.
 *
 * <p>The test checks the encryption header (key size, cipher and hash
 * algorithm), verifies the password, then extracts the text from the
 * decrypted data stream and compares it with the expected sentence.
 *
 * @throws Exception if the sample file cannot be opened, decrypted or parsed
 */
@Test
public void bug53475NoCSPName() throws Exception {
    File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
    NPOIFSFileSystem filesystem = new NPOIFSFileSystem(file, true);
    try {
        // Check the encryption details
        EncryptionInfo info = new EncryptionInfo(filesystem);
        assertEquals(128, info.getHeader().getKeySize());
        assertEquals(CipherAlgorithm.aes128, info.getHeader().getCipherAlgorithm());
        assertEquals(HashAlgorithm.sha1, info.getHeader().getHashAlgorithmEx());

        // Check it can be decoded
        Decryptor d = Decryptor.getInstance(info);
        assertTrue("Unable to process: document is encrypted", d.verifyPassword("solrcell"));

        // Check we can read the word document in that
        InputStream dataStream = d.getDataStream(filesystem);
        try {
            OPCPackage opc = OPCPackage.open(dataStream);
            XWPFDocument doc = new XWPFDocument(opc);
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            try {
                String text = ex.getText();
                assertNotNull(text);
                assertEquals("This is password protected Word document.", text.trim());
            } finally {
                // Release the extractor even when an assertion above fails.
                ex.close();
            }
        } finally {
            dataStream.close();
        }
    } finally {
        // Always release the underlying file, otherwise a failing assertion
        // would leave the sample document open for the rest of the test run.
        filesystem.close();
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in the Apache POI project.
From class TestExternalEntities, method testFile.
/**
 * Extracts the text of the sample file {@code ExternalEntityInText.docx}
 * and checks that the extracted text contains nothing after the colon,
 * i.e. that no text from the POI web site was pulled in.
 *
 * @throws IOException if the sample document cannot be opened or the
 *         extractor cannot be closed
 */
public void testFile() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ExternalEntityInText.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        String text = extractor.getText();
        assertTrue(text.length() > 0);
        // Check contents, they should not contain the text from POI web site after colon!
        assertEquals("Here should not be the POI web site: \"\"", text.trim());
    } finally {
        // Close the extractor even when one of the assertions fails.
        extractor.close();
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in the Apache POI project.
From class TestXWPFWordExtractor, method testSimpleControlContent.
/**
 * Tests basic extraction of SDT content.
 *
 * <p>First part: every expected fragment must occur in the lower-cased text
 * extracted from {@code Bug54849.docx}. Second part: each expected fragment
 * must occur exactly once in the lower-cased text of {@code Bug54771a.docx},
 * while {@code "test\n"} must occur exactly twice.
 *
 * @throws IOException if a sample document cannot be opened or an
 *         extractor cannot be closed
 */
public void testSimpleControlContent() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
    String[] targs = new String[] { "header_rich_text", "rich_text", "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table", "plain_text_no_newlines", "plain_text_with_newlines1\nplain_text_with_newlines2\n", "watermelon\n", "dirt\n", "4/16/2013\n", "rich_text_in_cell", "abc", "rich_text_in_paragraph_in_cell", "footer_rich_text", "footnote_sdt", "endnote_sdt" };
    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
    try {
        String s = ex.getText().toLowerCase(Locale.ROOT);
        int hits = 0;
        for (String targ : targs) {
            boolean hit = false;
            if (s.contains(targ)) {
                hit = true;
                hits++;
            }
            assertEquals("controlled content loading-" + targ, true, hit);
        }
        assertEquals("controlled content loading hit count", targs.length, hits);
    } finally {
        // Close the first extractor even when an assertion fails.
        ex.close();
    }

    doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
    targs = new String[] { "bb", "test subtitle\n", "test user\n" };
    ex = new XWPFWordExtractor(doc);
    try {
        String s = ex.getText().toLowerCase(Locale.ROOT);
        //This ensures that there is only one copy.
        for (String targ : targs) {
            // The targets are literal text, not regular expressions, so quote
            // them to keep any future metacharacter from changing the match.
            Matcher m = Pattern.compile(Pattern.quote(targ)).matcher(s);
            int hit = 0;
            while (m.find()) {
                hit++;
            }
            assertEquals("controlled content loading-" + targ, 1, hit);
        }
        //"test\n" appears twice: once as the "title" and once in the text.
        //This also happens when you save this document as text from MSWord.
        Matcher m = Pattern.compile("test\n").matcher(s);
        int hit = 0;
        while (m.find()) {
            hit++;
        }
        assertEquals("test<N>", 2, hit);
    } finally {
        // Close the second extractor even when an assertion fails.
        ex.close();
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in the Apache POI project.
From class TestXWPFWordExtractor, method testGetComplexText.
/**
 * Tests getting the text out of a complex file
 * ({@code IllustrativeCases.docx}).
 *
 * <p>Checks the beginning and end of the extracted text, one sentence from
 * the body that contains the euro sign, and the total number of newline
 * characters, which the test uses as its paragraph count.
 *
 * @throws IOException if the sample document cannot be opened or the
 *         extractor cannot be closed
 */
public void testGetComplexText() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("IllustrativeCases.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        String text = extractor.getText();
        assertTrue(text.length() > 0);
        char euro = '€';

        // Check contents
        assertStartsWith(text, " \n(V) ILLUSTRATIVE CASES\n\n");
        assertContains(text,
                "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n");
        assertEndsWith(text, "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n");

        // Check number of paragraphs by counting number of newlines
        int numberOfParagraphs = StringUtil.countMatches(text, '\n');
        assertEquals(134, numberOfParagraphs);
    } finally {
        // Close the extractor even when one of the assertions fails.
        extractor.close();
    }
}
Use of org.apache.poi.xwpf.usermodel.XWPFDocument in the Apache POI project.
From class TestXWPFWordExtractor, method testCheckboxes.
/**
 * Checks the complete text extracted from {@code checkboxes.docx}. In the
 * expected text, checkboxes appear as {@code |_|} and {@code |X|}, matching
 * the document's own "unchecked" and "checked" labels.
 *
 * @throws IOException if the sample document cannot be opened or the
 *         extractor cannot be closed
 */
public void testCheckboxes() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("checkboxes.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        assertEquals("This is a small test for checkboxes \nunchecked: |_| \n" + "Or checked: |X|\n\n\n\n\n" + "Test a checkbox within a textbox: |_| -> |X|\n\n\n" + "In Table:\n|_|\t|X|\n\n\n" + "In Sequence:\n|X||_||X|\n", extractor.getText());
    } finally {
        // Close the extractor even when the comparison fails.
        extractor.close();
    }
}
Aggregations