Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class SolidworksParserTest, method testDrawing2014SP0Parser.
/**
 * Runs a SolidWorks drawing saved in version 2014SP0 through {@link OfficeParser}
 * and checks the content type and document properties written to the metadata.
 */
@Test
public void testDrawing2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2014SP0.SLDDRW";
    try (InputStream drawing = SolidworksParserTest.class.getResourceAsStream(resource)) {
        ContentHandler sink = new BodyContentHandler();
        Metadata props = new Metadata();
        OfficeParser parser = new OfficeParser();
        parser.parse(drawing, sink, props, new ParseContext());

        // The detected media type must be the SolidWorks one.
        assertEquals("application/sldworks", props.get(Metadata.CONTENT_TYPE));

        // Document properties: some carry values, some are expected to be
        // absent (null), and title/keywords are expected to be empty strings.
        assertEquals("2012-07-03T12:05:29Z", props.get(TikaCoreProperties.CREATED));
        assertEquals(null, props.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:41:49Z", props.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", props.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, props.get(TikaCoreProperties.RELATION));
        assertEquals(null, props.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, props.get(TikaCoreProperties.SOURCE));
        assertEquals("", props.get(TikaCoreProperties.TITLE));
        assertEquals("", props.get(TikaCoreProperties.KEYWORDS));
    }
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class SolidworksParserTest, method testAssembly2013SP2Parser.
/**
 * Runs a SolidWorks assembly saved in version 2013SP2 through {@link OfficeParser}
 * and checks the content type and document properties written to the metadata.
 */
@Test
public void testAssembly2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksAssembly2013SP2.SLDASM";
    try (InputStream assembly = SolidworksParserTest.class.getResourceAsStream(resource)) {
        ContentHandler sink = new BodyContentHandler();
        Metadata props = new Metadata();
        OfficeParser parser = new OfficeParser();
        parser.parse(assembly, sink, props, new ParseContext());

        // The detected media type must be the SolidWorks one.
        assertEquals("application/sldworks", props.get(Metadata.CONTENT_TYPE));

        // Document properties: some carry values, some are expected to be
        // absent (null), and title/keywords are expected to be empty strings.
        assertEquals("2012-04-25T09:51:38Z", props.get(TikaCoreProperties.CREATED));
        assertEquals(null, props.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:11:08Z", props.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", props.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, props.get(TikaCoreProperties.RELATION));
        assertEquals(null, props.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, props.get(TikaCoreProperties.SOURCE));
        assertEquals("", props.get(TikaCoreProperties.TITLE));
        assertEquals("", props.get(TikaCoreProperties.KEYWORDS));
    }
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class Latin1StringsParserTest, method testParse.
/**
 * Feeds {@link Latin1StringsParser} a byte stream holding the same accented
 * sentence encoded as ISO-8859-1, UTF-8 and UTF-16, separated by zero bytes and
 * junk bytes and followed by a two-character string, and expects the handler
 * output to be exactly three newline-terminated copies of the sentence.
 */
@Test
public void testParse() throws Exception {
    String sentence = "These are Latin1 accented scripts: Â Ã É Ü â ã é ü";
    byte[] padding = new byte[10];
    byte[] junk = { 0x00, 0x01, 0x02, 0x03, 0x1E, 0x1F, (byte) 0xFF };

    // Assemble the input: three encodings of the sentence with filler in between.
    ByteArrayOutputStream raw = new ByteArrayOutputStream();
    raw.write(sentence.getBytes(ISO_8859_1));
    raw.write(padding);
    raw.write(sentence.getBytes(UTF_8));
    raw.write(junk);
    raw.write(sentence.getBytes(UTF_16));
    raw.write(padding);
    // Trailing two-character string; the expected output below does not contain it.
    raw.write("ab".getBytes(ISO_8859_1));

    Parser latin1Parser = new Latin1StringsParser();
    ContentHandler sink = new BodyContentHandler();
    try (InputStream in = new ByteArrayInputStream(raw.toByteArray())) {
        latin1Parser.parse(in, sink, new Metadata(), new ParseContext());
    }

    // The output must consist of nothing but the sentence repeated three times.
    String line = sentence + "\n";
    String expected = line + line + line;
    assertTrue(sink.toString().equals(expected));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class EmptyAndDuplicateElementsXMLParserTest, method testEmptiesAndRepeats.
/**
 * Parses testXML3.xml with {@code AllowEmptiesAndDuplicatesCustomXMLTestParser}
 * and checks that four first names and four last names are recorded, index by
 * index, including a repeated last name and an empty one.
 */
@Test
public void testEmptiesAndRepeats() throws Exception {
    try (InputStream xml = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream("/test-documents/testXML3.xml")) {
        Metadata collected = new Metadata();
        ContentHandler sink = new BodyContentHandler();
        new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(xml, sink, collected, new ParseContext());

        String[] firstNames = collected.getValues(FIRST_NAME);
        String[] lastNames = collected.getValues(LAST_NAME);
        assertEquals(4, firstNames.length);
        assertEquals(4, lastNames.length);

        // Expected {first, last} pair at each index; the third entry has an empty last name.
        String[][] expectedPeople = {
            { "John", "Smith" },
            { "Jane", "Doe" },
            { "Bob", "" },
            { "Kate", "Smith" },
        };
        for (int i = 0; i < expectedPeople.length; i++) {
            assertEquals(expectedPeople[i][0], firstNames[i]);
            assertEquals(expectedPeople[i][1], lastNames[i]);
        }
    }
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class PhoneExtractingContentHandlerTest, method testExtractPhoneNumbers.
/**
 * Parses testPhoneNumberExtractor.odt through a {@link PhoneExtractingContentHandler}
 * and checks the first seven values stored under the "phonenumbers" metadata key.
 */
@Test
public void testExtractPhoneNumbers() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    Metadata metadata = new Metadata();
    // The phone-extracting handler inspects the characters it receives for phone
    // numbers before forwarding them to the wrapped body handler.
    PhoneExtractingContentHandler phoneHandler =
            new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
    try (InputStream odt = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) {
        autoDetect.parse(odt, phoneHandler, metadata, new ParseContext());
    }

    // Each extracted value must contain the expected number at the same index.
    String[] expectedNumbers = {
        "9498888888",
        "9497777777",
        "9496666666",
        "9495555555",
        "4193404645",
        "9044687081",
        "2604094811",
    };
    String[] phoneNumbers = metadata.getValues("phonenumbers");
    for (int i = 0; i < expectedNumbers.length; i++) {
        assertContains(expectedNumbers[i], phoneNumbers[i]);
    }
}
Aggregations