Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class TikaResource, method createParser.
/**
 * Builds the parser used to serve a request.
 *
 * <p>An {@link AutoDetectParser} is created from {@code tikaConfig}, its
 * {@code application/xml} entry is replaced with an {@link HtmlParser}, and a
 * fallback parser is installed that rejects the request with
 * {@code 415 Unsupported Media Type}. When a {@code digester} is configured
 * the result is wrapped in a {@link DigestingParser}.
 *
 * @return the configured parser, wrapped in a {@link DigestingParser} when
 *         {@code digester} is non-null
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    // Send application/xml through the HTML parser.
    Map<MediaType, Parser> parsersByType = autoDetect.getParsers();
    parsersByType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(parsersByType);

    // The fallback reports the same supported types as the auto-detect parser,
    // but any attempt to actually parse with it is answered with a 415.
    autoDetect.setFallback(new Parser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class TikaResource, method fillMetadata.
/**
 * Populates {@code metadata} and {@code context} from the request headers.
 *
 * <ul>
 *   <li>The file name returned by {@code detectFilename}, if any, is stored
 *       under {@code TikaMetadataKeys.RESOURCE_NAME_KEY}.</li>
 *   <li>A {@code Content-Type} header is recorded in the metadata unless its
 *       subtype is {@code xml} or it equals {@code application/octet-stream};
 *       when recorded, the parser's detector is replaced by one that prefers
 *       the recorded type and otherwise delegates to the previous detector.</li>
 *   <li>A {@code Password} header, if present, is exposed through a
 *       {@link PasswordProvider} registered on the parse context.</li>
 * </ul>
 *
 * @param parser      parser whose detector may be replaced
 * @param metadata    metadata object to fill
 * @param context     parse context that receives the password provider
 * @param httpHeaders request headers to read from
 */
@SuppressWarnings("serial")
public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) {
    String resourceName = detectFilename(httpHeaders);
    if (resourceName != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
    }

    // Decide whether the declared content type is worth trusting: generic
    // "*/xml" and "application/octet-stream" declarations are discarded.
    String declared = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
    javax.ws.rs.core.MediaType mediaType = null;
    if (declared != null) {
        javax.ws.rs.core.MediaType candidate = javax.ws.rs.core.MediaType.valueOf(declared);
        boolean ignored = "xml".equals(candidate.getSubtype())
                || candidate.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE);
        if (!ignored) {
            mediaType = candidate;
        }
    }

    if (mediaType != null) {
        metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, mediaType.toString());
        final Detector detector = getDetector(parser);
        setDetector(parser, new Detector() {
            public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
                //make sure never to return null -- TIKA-1845
                String ct = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                if (ct != null) {
                    //this can return null if ct is not a valid mime type
                    MediaType parsed = MediaType.parse(ct);
                    if (parsed != null) {
                        return parsed;
                    }
                }
                // No usable declared type: defer to the detector that was installed before.
                return detector.detect(inputStream, metadata);
            }
        });
    }

    final String password = httpHeaders.getFirst("Password");
    if (password == null) {
        return;
    }
    context.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    });
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class TikaResource, method produceOutput.
/**
 * Prepares a streaming response that parses {@code is} and serializes the
 * resulting SAX events through an identity transformer.
 *
 * <p>The parser, metadata and parse context are set up and the request is
 * logged immediately; the actual parse runs when the returned
 * {@link StreamingOutput} is written.
 *
 * @param is          request body to parse
 * @param httpHeaders request headers used to fill metadata and parse context
 * @param info        request URI information (used for logging and as the parse path)
 * @param format      value for the transformer's {@link OutputKeys#METHOD} property
 * @return a streaming body producing UTF-8 encoded, indented output
 */
private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();

    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);

    return new StreamingOutput() {
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            Writer out = new OutputStreamWriter(outputStream, UTF_8);

            // Only creating the handler can fail with a configuration exception.
            final TransformerHandler serializer;
            try {
                SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                serializer = saxFactory.newTransformerHandler();
            } catch (TransformerConfigurationException e) {
                throw new WebApplicationException(e);
            }

            serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
            serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
            serializer.setResult(new StreamResult(out));

            ContentHandler sink = new ExpandedTitleContentHandler(serializer);
            parse(parser, LOG, info.getPath(), is, sink, metadata, context);
        }
    };
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class UnpackerResource, method process.
/**
 * Parses the request body and collects the files gathered by the embedded
 * document extractor.
 *
 * <p>When {@code saveAll} is set, the text written by the content handler and
 * a CSV rendering of the metadata are added to the result under
 * {@code TEXT_FILENAME} and {@code META_FILENAME}.
 *
 * @param is          request body to parse
 * @param httpHeaders request headers used to fill the metadata
 * @param info        request URI information (used for logging and as the parse path)
 * @param saveAll     whether to also return the extracted text and metadata
 * @return map from file name to file content
 * @throws WebApplicationException with status {@code 204 No Content} when
 *         {@code saveAll} is false and the extractor's counter is still zero
 * @throws Exception anything propagated from parsing or metadata serialization
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext pc = new ParseContext();

    //no need to digest for unwrapping
    Parser created = TikaResource.createParser();
    Parser parser = (created instanceof DigestingParser)
            ? ((DigestingParser) created).getWrappedParser()
            : created;

    TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // Text is only captured when everything is being saved; otherwise SAX events are discarded.
    ByteArrayOutputStream text = new ByteArrayOutputStream();
    ContentHandler ch = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> files = new HashMap<>();
    MutableInt count = new MutableInt();
    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));

    TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);

    if (!saveAll) {
        if (count.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return files;
    }

    files.put(TEXT_FILENAME, text.toByteArray());
    ByteArrayOutputStream metaStream = new ByteArrayOutputStream();
    metadataToCsv(metadata, metaStream);
    files.put(META_FILENAME, metaStream.toByteArray());
    return files;
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class RecursiveMetadataResourceTest, method testHandlerTypeInMultipartXML.
/**
 * Posts the recursive test document as a multipart attachment to each
 * handler-type endpoint and checks the content stored for the seventh
 * metadata entry: XHTML for the default, unparseable and xml endpoints,
 * plain text for the text endpoint, and no content for the ignore endpoint.
 */
@Test
public void testHandlerTypeInMultipartXML() throws Exception {
    final String docxType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    // default (unspecified), unparseable and xml: all three are expected to yield XHTML
    String[] xhtmlEndpoints = {
            endPoint + META_PATH + FORM_PATH,
            endPoint + META_PATH + FORM_PATH + UNPARSEABLE_PATH,
            endPoint + META_PATH + FORM_PATH + XML_PATH
    };
    for (String url : xhtmlEndpoints) {
        Attachment part = new Attachment("myworddocx", docxType, ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
        WebClient client = WebClient.create(url);
        Response resp = client.type("multipart/form-data").accept("application/json").post(part);
        Reader json = new InputStreamReader((InputStream) resp.getEntity(), UTF_8);
        List<Metadata> entries = JsonMetadataList.fromJson(json);
        assertEquals(12, entries.size());
        String body = entries.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
        assertTrue(body.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
    }

    //text
    Attachment textPart = new Attachment("myworddocx", docxType, ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
    WebClient textClient = WebClient.create(endPoint + META_PATH + FORM_PATH + TEXT_PATH);
    Response textResponse = textClient.type("multipart/form-data").accept("application/json").post(textPart);
    Reader textReader = new InputStreamReader((InputStream) textResponse.getEntity(), UTF_8);
    List<Metadata> textEntries = JsonMetadataList.fromJson(textReader);
    assertEquals(12, textEntries.size());
    String textBody = textEntries.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(textBody.startsWith("embed_3"));

    //ignore -- no content
    Attachment ignorePart = new Attachment("myworddocx", docxType, ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
    WebClient ignoreClient = WebClient.create(endPoint + META_PATH + FORM_PATH + IGNORE_PATH);
    Response ignoreResponse = ignoreClient.type("multipart/form-data").accept("application/json").query("handler", "ignore").post(ignorePart);
    Reader ignoreReader = new InputStreamReader((InputStream) ignoreResponse.getEntity(), UTF_8);
    List<Metadata> ignoreEntries = JsonMetadataList.fromJson(ignoreReader);
    assertEquals(12, ignoreEntries.size());
    assertNull(ignoreEntries.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
}
Aggregations