use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ZlibParserTest method testZlibParsing.
/**
 * Parses a zlib-compressed text fixture with {@link AutoDetectParser} and checks
 * both the reported content type and the extracted text.
 *
 * @throws Exception if the fixture cannot be read or parsing fails
 */
@Test
public void testZlibParsing() throws Exception {
    // No format-specific parser is selected here: detection has to pick it.
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // Resolve the fixture through the running test class instead of an unrelated
    // test class; the path is absolute, so the same resource is located.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.zlib")) {
        // recursingContext is a fixture declared outside this method.
        parser.parse(stream, handler, metadata, recursingContext);
    }
    assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
    String content = handler.toString();
    assertContains("Test d'indexation de Txt", content);
    assertContains("http://www.apache.org", content);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class MboxParserTest method setUp.
/**
 * Prepares the fixtures shared by the tests: a type detector, an auto-detecting
 * parser built on it, a parse context that exposes that parser under
 * {@code Parser.class}, and an {@link MboxParser} with tracking switched on.
 *
 * @throws Exception if any fixture cannot be constructed
 */
@Before
public void setUp() throws Exception {
    // Build the detector locally, then publish it to the field.
    TypeDetector detector = new TypeDetector();
    typeDetector = detector;
    autoDetectParser = new AutoDetectParser(detector);

    // Populate the context before exposing it through the field.
    ParseContext context = new ParseContext();
    context.set(Parser.class, autoDetectParser);
    recursingContext = context;

    MboxParser tracked = new MboxParser();
    tracked.setTracking(true);
    mboxParser = tracked;
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OutlookPSTParserTest method testParse.
/**
 * Parses the PST fixture with {@link AutoDetectParser}, rendering to HTML, and
 * checks the rendered output as well as the metadata collected for the embedded
 * documents by the tracking extractor.
 *
 * @throws Exception if the fixture cannot be read or parsing fails
 */
@Test
public void testParse() throws Exception {
    Parser pstParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new ToHTMLContentHandler();
    ParseContext context = new ParseContext();
    // The tracking extractor records metadata of every embedded document it is handed.
    EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
    context.set(Parser.class, new AutoDetectParser());
    // Close the fixture stream even when parsing throws; it was previously never closed.
    // Fully-qualified type so this block does not depend on an import outside its view.
    try (java.io.InputStream stream = getResourceAsStream("/test-documents/testPST.pst")) {
        pstParser.parse(stream, handler, metadata, context);
    }
    String output = handler.toString();
    assertFalse(output.isEmpty());
    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"<530D9CAC.5080901@gmail.com>\"><h1>Re: Feature Generators</h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"<1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
    // Metadata of the embedded documents, in the order the extractor saw them.
    List<Metadata> metaList = trackingExtrator.trackingMetadata;
    assertEquals(6, metaList.size());
    Metadata firstMail = metaList.get(0);
    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
    assertEquals("", firstMail.get("displayCC"));
    assertEquals("", firstMail.get("displayBCC"));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RFC822ParserTest method testSimple.
/**
 * Parses a simple RFC822 message with {@link RFC822Parser} against a mocked
 * handler and verifies the SAX events emitted (a single {@code p} element and no
 * {@code div} elements) together with the creator, title and subject metadata.
 *
 * @throws Exception if the fixture cannot be read or parsing fails; the exception
 *         is propagated unchanged so the test report keeps its type and stack trace
 */
@Test
public void testSimple() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    // Close the fixture stream on every path; it was previously never closed.
    try (InputStream stream = getStream("test-documents/testRFC822")) {
        // No catch-and-fail wrapper: the method already declares throws Exception,
        // and letting the exception escape preserves its cause for diagnosis.
        parser.parse(stream, handler, metadata, context);
    }
    verify(handler).startDocument();
    //just one body
    verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
    verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
    //no multi-part body parts
    verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
    verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
    verify(handler).endDocument();
    //note no leading spaces, and no quotes
    assertEquals("Julien Nioche (JIRA) <jira@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class SQLite3ParserTest method testBasic.
/**
 * Runs {@link AutoDetectParser} over the same SQLite fixture supplied through
 * three kinds of input stream (a resource stream, an in-memory byte array and a
 * {@code TikaInputStream} over the file) and checks the XML output and the table
 * names reported in the metadata for each of them.
 *
 * @throws Exception if the fixture cannot be read or parsing fails
 */
@Test
public void testBasic() throws Exception {
    Parser p = new AutoDetectParser();
    // Buffer the fixture in memory first; the source stream is closed as soon as
    // the copy is done (it was previously left open).
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (InputStream source = getResourceAsStream(TEST_FILE1)) {
        IOUtils.copy(source, bos);
    }
    //test different types of input streams
    //actual inputstream, memory buffered bytearray and literal file
    InputStream[] streams = new InputStream[3];
    streams[0] = getResourceAsStream(TEST_FILE1);
    streams[1] = new ByteArrayInputStream(bos.toByteArray());
    streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));
    int tests = 0;
    for (InputStream stream : streams) {
        // try-with-resources closes the stream even when an assertion below fails.
        try (InputStream in = stream) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
            //1) getXML closes the stream
            //2) getXML runs recursively on the contents, so the embedded docs should show up
            XMLResult result = getXML(in, p, metadata);
            String x = result.xml;
            //first table name
            assertContains("<table name=\"my_table1\"><thead><tr>\t<th>PK</th>", x);
            //non-ascii
            assertContains("<td>普林斯顿大学</td>", x);
            //boolean
            assertContains("<td>true</td>\t<td>2015-01-02</td>", x);
            //date test
            assertContains("2015-01-04", x);
            //timestamp test
            assertContains("2015-01-03 15:17:03", x);
            //first embedded doc's image tag
            assertContains("alt=\"image1.png\"", x);
            //second embedded doc's image tag
            assertContains("alt=\"A description...\"", x);
            //second table name
            assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x);
            Metadata post = result.metadata;
            String[] tableNames = post.getValues(Database.TABLE_NAME);
            assertEquals(2, tableNames.length);
            assertEquals("my_table1", tableNames[0]);
            assertEquals("my_table2", tableNames[1]);
            tests++;
        }
    }
    // Guards against the loop silently covering fewer stream kinds than intended.
    assertEquals(3, tests);
}
Aggregations