use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class JournalParser method parse.
/**
 * Parses the document twice: first through the GROBID REST parser, which is
 * given the path of a file holding the stream's content, and then through
 * {@link PDFParser}, which re-reads that same file.
 *
 * <p>Both parsers report into the same {@code handler} and {@code metadata}.
 *
 * @param stream   the document to parse
 * @param handler  receives the SAX events produced by both parsers
 * @param metadata document metadata, passed to both parsers
 * @param context  parse context, passed to both parsers
 * @throws IOException   if the stream cannot be read or written to a file
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if parsing fails or the temporary resources cannot be disposed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Track temporary resources explicitly so they can be released once both
    // parsers are done, even when one of them throws.
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // Close the file stream deterministically instead of leaving it to GC.
        try (InputStream pdfStream = new FileInputStream(tmpFile)) {
            parser.parse(pdfStream, handler, metadata, context);
        }
    } finally {
        tmp.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class JpegParser method parse.
/**
 * Extracts metadata from a JPEG stream and emits an empty XHTML document.
 *
 * <p>The stream is run through {@code ImageMetadataExtractor.parseJpeg} (as a
 * file) and then through {@code JempboxExtractor.parse} (as a stream); both
 * write into {@code metadata}. No body content is produced.
 *
 * @param stream   the JPEG data
 * @param handler  receives the start/end of the (empty) XHTML document
 * @param metadata populated by the two extractors
 * @param context  parse context (not used by this method)
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if extraction or temporary-resource disposal fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream jpegStream = TikaInputStream.get(stream, resources);

        // File-based extraction first, then stream-based extraction.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(jpegStream.getFile());

        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(jpegStream);
    } finally {
        // Always release whatever was registered with the temporary resources.
        resources.dispose();
    }

    // Only metadata is extracted: open and immediately close the document.
    final XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class DetectorResource method detect.
/**
 * Detects the media type of the request body.
 *
 * <p>The filename derived from the request headers is logged and added to the
 * metadata under {@code Metadata.RESOURCE_NAME_KEY} before the configured
 * detector is run.
 *
 * @param is          the request body
 * @param httpHeaders the request headers, used to derive the filename and to obtain the input stream
 * @param info        request URI information (not used by this method)
 * @return the detected media type as a string, or
 *         {@code MediaType.OCTET_STREAM} if an {@link IOException} occurs
 */
@PUT
@Path("stream")
@Consumes("*/*")
@Produces("text/plain")
public String detect(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
    Metadata met = new Metadata();
    String filename = TikaResource.detectFilename(httpHeaders.getRequestHeaders());
    LOG.info("Detecting media type for Filename: {}", filename);
    met.add(Metadata.RESOURCE_NAME_KEY, filename);
    // try-with-resources closes the TikaInputStream once detection is finished,
    // whether it succeeds or fails.
    try (TikaInputStream tis = TikaInputStream.get(TikaResource.getInputStream(is, httpHeaders))) {
        return TikaResource.getConfig().getDetector().detect(tis, met).toString();
    } catch (IOException e) {
        LOG.warn("Unable to detect MIME type for file. Reason: {}", e.getMessage(), e);
        return MediaType.OCTET_STREAM.toString();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RTFParserTest method testEmbeddedLinkedDocument.
//TIKA-1010 test linked embedded doc
/**
 * Runs container extraction over {@code testRTFEmbeddedLink.rtf} twice: once
 * with a tracker constructed with EMF/WMF as skip types, and once with a
 * default tracker. Neither pass may throw.
 */
@Test
public void testEmbeddedLinkedDocument() throws Exception {
    Set<MediaType> skipTypes = new HashSet<>();
    skipTypes.add(MediaType.parse("image/emf"));
    skipTypes.add(MediaType.parse("image/wmf"));

    // First pass: tracker built with the EMF/WMF skip set.
    TrackingHandler tracker = new TrackingHandler(skipTypes);
    try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor ex = new ParserContainerExtractor();
        assertTrue(ex.isSupported(tis));
        ex.extract(tis, ex, tracker);
    }
    //should gracefully skip link and not throw NPE, IOEx, etc
    assertEquals(0, tracker.filenames.size());

    // Second pass: default tracker, no skip set supplied.
    tracker = new TrackingHandler();
    try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor ex = new ParserContainerExtractor();
        assertTrue(ex.isSupported(tis));
        ex.extract(tis, ex, tracker);
    }
    // Two entries are now tracked -- presumably the EMF/WMF resources that the
    // first pass was told to skip (TODO confirm against the test document).
    assertEquals(2, tracker.filenames.size());
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RTFParserTest method testRegularImages.
//TIKA-1010 test regular (not "embedded") images/picts
/**
 * Parses {@code testRTFRegularImages.rtf} with a recursive parser wrapper and
 * checks the metadata collected for two of the images it contains.
 */
@Test
public void testRegularImages() throws Exception {
    Metadata rootMetadata = new Metadata();
    rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");

    ParseContext ctx = new ParseContext();
    ContentHandler handler = new BodyContentHandler();
    Parser base = new AutoDetectParser();
    RecursiveParserWrapper parser = new RecursiveParserWrapper(
            base,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));

    try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
        parser.parse(tis, handler, rootMetadata, ctx);
    }

    List<Metadata> collected = parser.getMetadata();
    // Index 1 is expected to be testJPEG_EXIF_普林斯顿.jpg
    Metadata exifJpeg = collected.get(1);
    // Index 3 is expected to be testJPEG_普林斯顿.jpg
    Metadata plainJpeg = collected.get(3);
    assertTrue(exifJpeg != null);
    assertTrue(plainJpeg != null);

    List<String> exifSubjects = Arrays.asList(exifJpeg.getValues("dc:subject"));
    assertTrue(exifSubjects.contains("serbor"));
    assertTrue(plainJpeg.get("Comments").contains("Licensed to the Apache"));

    // Metadata from one embedded object must not leak into the next.
    List<String> plainSubjects = Arrays.asList(plainJpeg.getValues("dc:subject"));
    assertFalse(plainSubjects.contains("serbor"));

    assertEquals("false", plainJpeg.get(RTFMetadata.THUMBNAIL));
    assertEquals("false", exifJpeg.get(RTFMetadata.THUMBNAIL));

    assertEquals(49, plainJpeg.names().length);
    assertEquals(113, exifJpeg.names().length);
}
Aggregations