use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class PDFParserTest method testEmbeddedFilesInChildren.
// TIKA-1228, TIKA-1268
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    // The word "regressiveness" occurs only in the embedded Unit10.doc,
    // never in the container PDF itself.
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    assertTrue(containerXml.contains("regressiveness"));

    // Recursive wrapper that discards content and only collects per-document metadata.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));

    // Inline images are extracted, including repeated ones.
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream pdfStream =
                 TikaInputStream.get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(pdfStream, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(5, collected.size());

    // Entry 0 carries no resource name (presumably the container document);
    // entries 1-4 are the documents found inside it.
    Metadata entry0 = collected.get(0);
    Metadata entry1 = collected.get(1);
    Metadata entry2 = collected.get(2);
    Metadata entry3 = collected.get(3);
    Metadata entry4 = collected.get(4);

    // Resource names
    assertNull(entry0.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", entry1.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", entry3.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", entry4.get(Metadata.RESOURCE_NAME_KEY));

    // Content types
    assertEquals(MediaType.image("jpeg").toString(), entry1.get(Metadata.CONTENT_TYPE));
    assertEquals(MediaType.image("tiff").toString(), entry2.get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", entry3.get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), entry4.get(Metadata.CONTENT_TYPE));
}
use of org.apache.tika.io.TikaInputStream in project data-prep by Talend.
the class HtmlDetector method detect.
/**
 * Inspects the head of an input stream and reports whether it looks like HTML.
 *
 * <p>A detector must not close the stream it is given; closing is the caller's responsibility.
 * The stream is marked before reading and reset afterwards, relying on the mark/reset support
 * of {@link TikaInputStream}, so the caller still sees the same bytes.
 *
 * @param metadata the TIKA {@link Metadata} passed on to the HTML encoding detector
 * @param inputStream the stream to inspect; {@code null} yields {@code null}
 * @return an HTML {@link Format} carrying the detected charset name, or {@code null} when the
 *         stream is {@code null} or no charset was detected
 * @throws IOException propagated from reading, resetting or detecting on the stream
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    if (inputStream == null) {
        return null;
    }

    // Read at most META_TAG_BUFFER_SIZE bytes, then rewind so the caller's view is unchanged.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    final byte[] headBytes = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int filled = 0;
    int lastRead = inputStream.read(headBytes);
    while (lastRead != -1 && filled < headBytes.length) {
        filled += lastRead;
        lastRead = inputStream.read(headBytes, filled, headBytes.length - filled);
    }
    inputStream.reset();

    // Run the encoding detection on a private copy of the head only.
    final String headText = FormatUtils.readFromBuffer(headBytes, 0, filled);
    try (InputStream headStream = TikaInputStream.get(IOUtils.toInputStream(headText))) {
        final Charset detected = htmlEncodingDetector.detect(headStream, metadata);
        return detected == null ? null : new Format(htmlFormatFamily, detected.name());
    }
}
use of org.apache.tika.io.TikaInputStream in project data-prep by Talend.
the class CSVDetector method detect.
/**
 * Inspects an input stream and reports whether it looks like CSV.
 *
 * <p>A detector must not close the stream it is given; closing is the caller's responsibility.
 * When the first detection attempt on the stream itself yields nothing, the head of the stream
 * is read between a mark and a reset, relying on the mark/reset support of
 * {@link TikaInputStream}, and detection is retried on that head alone.
 *
 * @param metadata the TIKA {@link Metadata} used for the first detection attempt
 * @param inputStream the stream to inspect
 * @return the format reported by {@code detectText}, or {@code null} when both attempts fail
 * @throws IOException propagated from reading, resetting or detecting on the stream
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    final Format direct = detectText(metadata, inputStream);
    if (direct != null) {
        return direct;
    }

    // Fallback: read at most META_TAG_BUFFER_SIZE bytes, then rewind for the caller.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    final byte[] headBytes = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int filled = 0;
    int lastRead = inputStream.read(headBytes);
    while (lastRead != -1 && filled < headBytes.length) {
        filled += lastRead;
        lastRead = inputStream.read(headBytes, filled, headBytes.length - filled);
    }
    inputStream.reset();

    // Retry on the head only, with a fresh Metadata instance rather than the caller's.
    final String headText = FormatUtils.readFromBuffer(headBytes, 0, filled);
    try (InputStream headStream = TikaInputStream.get(IOUtils.toInputStream(headText))) {
        return detectText(new Metadata(), headStream);
    }
}
use of org.apache.tika.io.TikaInputStream in project spring-boot-quick by vector4wang.
the class TikaUtil method handleStreamContent.
/**
 * Parses an in-memory document and returns the text collected by the text content handler.
 *
 * @param file the raw bytes of the document to parse
 * @return the text written to the handler's buffer during parsing
 * @throws Exception if parsing fails or the input stream cannot be closed
 */
public static String handleStreamContent(byte[] file) throws Exception {
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(getTextContentHandler(textBuffer));
    // Close the TikaInputStream on both success and failure so that anything it
    // holds is released; the original never closed it.
    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }
    return textBuffer.toString();
}
use of org.apache.tika.io.TikaInputStream in project uPortal by Jasig.
the class JaxbPortalDataHandlerService method getMediaType.
/**
 * Detects the media type of the stream's content, using the file name as a detection hint.
 *
 * <p>The supplied stream is wrapped so that closing the Tika stream does not close it, and it
 * is reset before this method returns.
 *
 * @param inputStream the buffered stream to inspect; it is reset, not closed
 * @param fileName the resource name given to the detector and used in log messages
 * @return the detected media type, or {@code null} when detection fails with an
 *         {@link IOException}
 * @throws IOException if resetting {@code inputStream} fails
 */
private MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
    // Shield the caller's stream: closing the Tika wrapper below must not close it.
    final TikaInputStream tikaStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
    MediaType detected = null;
    try {
        final Detector detector = new DefaultDetector();
        final Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, fileName);
        detected = detector.detect(tikaStream, hints);
        logger.debug("Determined '{}' for '{}'", detected, fileName);
    } catch (IOException e) {
        // NOTE(review): the message says "assuming XML" but null is returned here;
        // presumably the caller treats null as XML - confirm.
        logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
    } finally {
        IOUtils.closeQuietly(tikaStream);
        // Rewind the buffered stream to make up for anything the detector consumed.
        // NOTE(review): no mark() is set in this method; presumably the caller marks
        // the stream beforehand - confirm.
        inputStream.reset();
    }
    return detected;
}
Aggregations