use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class SolidworksParserTest method testDrawing2014SP0Parser.
/**
 * Parses a SolidWorks drawing saved by version 2014 SP0 and verifies the
 * reported content type together with the extracted document properties.
 */
@Test
public void testDrawing2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2014SP0.SLDDRW";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata extracted = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser parser = new OfficeParser();
        parser.parse(stream, body, extracted, new ParseContext());

        // Content type reported by the parser
        assertEquals("application/sldworks", extracted.get(Metadata.CONTENT_TYPE));

        // Document properties: creation/modification data is expected to be set,
        // while contributor, relation, rights and source are expected to be absent
        assertEquals("2012-07-03T12:05:29Z", extracted.get(TikaCoreProperties.CREATED));
        assertEquals(null, extracted.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:41:49Z", extracted.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", extracted.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, extracted.get(TikaCoreProperties.RELATION));
        assertEquals(null, extracted.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, extracted.get(TikaCoreProperties.SOURCE));

        // Title and keywords are expected to be present but empty
        assertEquals("", extracted.get(TikaCoreProperties.TITLE));
        assertEquals("", extracted.get(TikaCoreProperties.KEYWORDS));
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class SolidworksParserTest method testAssembly2013SP2Parser.
/**
 * Parses a SolidWorks assembly saved by version 2013 SP2 and verifies the
 * reported content type together with the extracted document properties.
 */
@Test
public void testAssembly2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksAssembly2013SP2.SLDASM";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        Metadata extracted = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser parser = new OfficeParser();
        parser.parse(stream, body, extracted, new ParseContext());

        // Content type reported by the parser
        assertEquals("application/sldworks", extracted.get(Metadata.CONTENT_TYPE));

        // Document properties: creation/modification data is expected to be set,
        // while contributor, relation, rights and source are expected to be absent
        assertEquals("2012-04-25T09:51:38Z", extracted.get(TikaCoreProperties.CREATED));
        assertEquals(null, extracted.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:11:08Z", extracted.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", extracted.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, extracted.get(TikaCoreProperties.RELATION));
        assertEquals(null, extracted.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, extracted.get(TikaCoreProperties.SOURCE));

        // Title and keywords are expected to be present but empty
        assertEquals("", extracted.get(TikaCoreProperties.TITLE));
        assertEquals("", extracted.get(TikaCoreProperties.KEYWORDS));
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class Latin1StringsParserTest method testParse.
/**
 * Feeds the parser a byte stream that contains the same accented sentence in
 * ISO-8859-1, UTF-8 and UTF-16, separated by zero padding and control bytes
 * and followed by a two-character string, then checks that the extracted
 * text is exactly the sentence three times, one per line.
 */
@Test
public void testParse() throws Exception {
    String testStr = "These are Latin1 accented scripts: Â Ã É Ü â ã é ü";
    String smallStr = "ab";

    byte[] zeros = new byte[10];
    byte[] trashBytes = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };

    // Segments in the exact order they are laid out in the input stream.
    byte[][] segments = {
        testStr.getBytes(ISO_8859_1),
        zeros,
        testStr.getBytes(UTF_8),
        trashBytes,
        testStr.getBytes(UTF_16),
        zeros,
        smallStr.getBytes(ISO_8859_1)
    };

    ByteArrayOutputStream assembled = new ByteArrayOutputStream();
    for (byte[] segment : segments) {
        assembled.write(segment);
    }

    ContentHandler handler = new BodyContentHandler();
    Parser parser = new Latin1StringsParser();
    try (InputStream stream = new ByteArrayInputStream(assembled.toByteArray())) {
        parser.parse(stream, handler, new Metadata(), new ParseContext());
    }

    // Expected output: the sentence on three consecutive lines and nothing else,
    // so padding, control bytes and the two-character string must not show up.
    String line = testStr + "\n";
    String expected = line + line + line;

    String result = handler.toString();
    assertTrue(result.equals(expected));
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class DcXMLParserTest method testXMLParserAsciiChars.
/**
 * Parses {@code testXML.xml} with {@code DcXMLParser} and verifies the
 * Dublin Core style metadata and the extracted body text.
 */
@Test
public void testXMLParserAsciiChars() throws Exception {
    // The file contains 5 dc:subject tags; they are expected to surface as a
    // multi-valued metadata entry, in file order.
    final String[] expectedSubjects = {"Java", "XML", "XSLT", "JDOM", "Indexation"};

    try (InputStream xml = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        new DcXMLParser().parse(xml, body, meta);

        assertEquals("application/xml", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Tika test document", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Rida Benjelloun", meta.get(TikaCoreProperties.CREATOR));

        // Subjects as exposed through TikaCoreProperties.KEYWORDS
        assertTrue(meta.isMultiValued(TikaCoreProperties.KEYWORDS));
        String[] keywords = meta.getValues(TikaCoreProperties.KEYWORDS);
        assertEquals(expectedSubjects.length, keywords.length);
        for (int i = 0; i < expectedSubjects.length; i++) {
            assertEquals(expectedSubjects[i], keywords[i]);
        }

        // The same subjects as exposed through Metadata.SUBJECT
        assertTrue(meta.isMultiValued(Metadata.SUBJECT));
        String[] subjects = meta.getValues(Metadata.SUBJECT);
        assertEquals(expectedSubjects.length, subjects.length);
        for (int i = 0; i < expectedSubjects.length; i++) {
            assertEquals(expectedSubjects[i], subjects[i]);
        }

        assertEquals("Framework d\'indexation des documents XML, HTML, PDF etc..", meta.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("http://www.apache.org", meta.get(TikaCoreProperties.IDENTIFIER));
        assertEquals("test", meta.get(TikaCoreProperties.TYPE));
        assertEquals("application/msword", meta.get(TikaCoreProperties.FORMAT));
        assertEquals("Fr", meta.get(TikaCoreProperties.LANGUAGE));

        String rights = meta.get(TikaCoreProperties.RIGHTS);
        assertTrue(rights.contains("testing chars"));

        // Body text captured by the content handler
        String content = body.toString();
        assertContains("Tika test document", content);

        assertEquals("2000-12-01T00:00:00.000Z", meta.get(TikaCoreProperties.CREATED));
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class EmptyAndDuplicateElementsXMLParserTest method testEmptiesAndRepeats.
/**
 * Parses {@code testXML3.xml} with the custom test parser and verifies that
 * four first-name and four last-name values are collected in order,
 * including a repeated last name and an empty one.
 */
@Test
public void testEmptiesAndRepeats() throws Exception {
    // Expected {first name, last name} pairs, in order.
    final String[][] expectedPeople = {
        {"John", "Smith"},
        {"Jane", "Doe"},
        {"Bob", ""},
        {"Kate", "Smith"}
    };

    try (InputStream xml = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream("/test-documents/testXML3.xml")) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(xml, body, meta, new ParseContext());

        String[] firstNames = meta.getValues(FIRST_NAME);
        assertEquals(4, firstNames.length);
        String[] lastNames = meta.getValues(LAST_NAME);
        assertEquals(4, lastNames.length);

        // Check each person: first name, then last name.
        for (int i = 0; i < expectedPeople.length; i++) {
            assertEquals(expectedPeople[i][0], firstNames[i]);
            assertEquals(expectedPeople[i][1], lastNames[i]);
        }
    }
}
Aggregations