use of org.apache.tika.parser.ParseContext in project tika by apache.
the class ExcelParserTest method testJXL.
/**
 * Parses the {@code /test-documents/jxl.xls} classpath resource with {@link OfficeParser},
 * using a {@link ParseContext} whose {@link Locale} is set to {@link Locale#US}, and checks
 * both the reported content type and that the extracted text contains "Number Formats".
 *
 * @throws Exception if reading or parsing the resource fails
 */
@Test
public void testJXL() throws Exception {
    // Parse inputs are built up front; none of them depend on the stream.
    ParseContext parseContext = new ParseContext();
    parseContext.set(Locale.class, Locale.US);
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler(-1);

    try (InputStream xls = ExcelParserTest.class.getResourceAsStream("/test-documents/jxl.xls")) {
        new OfficeParser().parse(xls, bodyHandler, meta, parseContext);

        assertEquals("application/vnd.ms-excel", meta.get(Metadata.CONTENT_TYPE));
        assertContains("Number Formats", bodyHandler.toString());
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class RFC822ParserTest method testI18NHeaders.
/**
 * Parses the {@code testRFC822_i18nheaders} test document with {@link RFC822Parser} and checks
 * that the creator, title and subject metadata hold the decoded header values.
 *
 * @throws Exception if parsing fails; the exception is propagated unchanged so the test report
 *         keeps the full stack trace and cause instead of only the message text
 */
@Test
public void testI18NHeaders() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);

    // try-with-resources guarantees the test stream is closed even when parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822_i18nheaders")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    //tests correct decoding of internationalized headers, both
    //quoted-printable (Q) and Base64 (B).
    assertEquals("Keld Jørn Simonsen <keld@dkuug.dk>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("If you can read this you understand the example.", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("If you can read this you understand the example.", metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class RFC822ParserTest method getDate.
/**
 * Builds a two-header message ({@code From:} plus a {@code Date:} header carrying
 * {@code dateString}), parses its UTF-8 bytes with {@link RFC822Parser}, and returns the
 * {@link TikaCoreProperties#CREATED} value read from the resulting metadata.
 *
 * @param dateString text placed verbatim after {@code "Date: "} in the generated message
 * @return the result of {@code Metadata.getDate(TikaCoreProperties.CREATED)} after the parse
 * @throws Exception if the parse or stream handling fails
 */
private Date getDate(String dateString) throws Exception {
    Metadata parsed = new Metadata();
    byte[] rawMail =
            ("From: dev@tika.apache.org\n" + "Date: " + dateString + "\n").getBytes(StandardCharsets.UTF_8);

    try (InputStream mailStream = TikaInputStream.get(rawMail)) {
        new RFC822Parser().parse(mailStream, new DefaultHandler(), parsed, new ParseContext());
    }
    return parsed.getDate(TikaCoreProperties.CREATED);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class RFC822ParserTest method testSimple.
/**
 * Parses the {@code testRFC822} test document with {@link RFC822Parser} against a mocked
 * handler, then verifies the emitted SAX events (one {@code p} element, no {@code div}
 * elements, document start and end) and the creator, title and subject metadata.
 *
 * @throws Exception if parsing or verification fails; the exception is propagated unchanged so
 *         the test report keeps the full stack trace and cause instead of only the message text
 */
@Test
public void testSimple() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());

    // try-with-resources guarantees the test stream is closed even when parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822")) {
        parser.parse(stream, handler, metadata, context);
    }

    verify(handler).startDocument();
    //just one body
    verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
    verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
    //no multi-part body parts
    verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
    verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
    verify(handler).endDocument();
    //note no leading spaces, and no quotes
    assertEquals("Julien Nioche (JIRA) <jira@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class RFC822ParserTest method testUnusualFromAddress.
/**
 * The from isn't in the usual form.
 * See TIKA-618
 *
 * <p>Parses the {@code testRFC822_oddfrom} test document with {@link RFC822Parser} and checks
 * the creator, title and subject metadata values.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testUnusualFromAddress() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);

    // try-with-resources guarantees the test stream is closed even when parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822_oddfrom")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    assertEquals("Saved by Windows Internet Explorer 7", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Air Permit Programs | Air & Radiation | US EPA", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("Air Permit Programs | Air & Radiation | US EPA", metadata.get(Metadata.SUBJECT));
}
Aggregations