Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class TikaResource, method produceOutput.
/**
 * Prepares a parse of the request body and returns a {@link StreamingOutput}
 * that performs the parse when the response is written.
 * <p>
 * The parser, metadata and parse context are set up eagerly (via
 * {@code createParser}, {@code fillMetadata} and {@code fillParseContext}) and
 * the request is logged before this method returns. The parse itself is
 * deferred until {@code write} is invoked on the returned object.
 *
 * @param is          stream holding the document to parse
 * @param httpHeaders request headers handed to {@code fillMetadata} and
 *                    {@code fillParseContext}
 * @param info        request URI information; its path is passed to {@code parse}
 * @param format      value used for the transformer's {@link OutputKeys#METHOD}
 *                    output property
 * @return a streaming body that serializes the parse events as UTF-8 text
 */
private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();

    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);

    return new StreamingOutput() {
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            final ContentHandler sink;
            try {
                TransformerHandler serializer =
                        ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
                serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
                serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
                // The serializer writes characters; encode them to the response stream as UTF-8.
                serializer.setResult(new StreamResult(new OutputStreamWriter(outputStream, UTF_8)));
                sink = new ExpandedTitleContentHandler(serializer);
            } catch (TransformerConfigurationException e) {
                // A misconfigured transformer is surfaced to the client as a web application error.
                throw new WebApplicationException(e);
            }
            parse(parser, LOG, info.getPath(), is, sink, metadata, context);
        }
    };
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class UnpackerResource, method process.
/**
 * Parses the request body with an {@link EmbeddedDocumentExtractor} installed
 * in the parse context and returns the collected files keyed by name.
 * <p>
 * When {@code saveAll} is set, the main document's text (under
 * {@code TEXT_FILENAME}) and its metadata rendered by {@code metadataToCsv}
 * (under {@code META_FILENAME}) are added to the result as well.
 *
 * @param is          stream holding the container document
 * @param httpHeaders request headers; forwarded to {@code TikaResource.fillMetadata}
 * @param info        request URI information, used for logging and as the parse path
 * @param saveAll     whether to also capture the main document's text and metadata
 * @return map from file name to file content
 * @throws WebApplicationException with status {@code NO_CONTENT} when
 *         {@code saveAll} is false and the embedded-document counter is still zero
 * @throws Exception propagated from parsing or metadata serialization
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        // Digests are not needed when unpacking, so use the wrapped parser directly.
        parser = ((DigestingParser) parser).getWrappedParser();
    }

    TikaResource.fillMetadata(parser, metadata, context, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // Body text is only captured when the caller asked for everything.
    ByteArrayOutputStream textBuffer = new ByteArrayOutputStream();
    ContentHandler handler = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBuffer, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> extracted = new HashMap<>();
    MutableInt embeddedCount = new MutableInt();
    // NOTE(review): the extractor presumably fills `extracted` and bumps `embeddedCount` - confirm.
    context.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));

    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, context);

    if (saveAll) {
        extracted.put(TEXT_FILENAME, textBuffer.toByteArray());
        ByteArrayOutputStream metadataBuffer = new ByteArrayOutputStream();
        metadataToCsv(metadata, metadataBuffer);
        extracted.put(META_FILENAME, metadataBuffer.toByteArray());
    } else if (embeddedCount.intValue() == 0) {
        throw new WebApplicationException(Response.Status.NO_CONTENT);
    }
    return extracted;
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class HtmlParserTest, method testIgnoreCharsetDetectorLanguage.
/**
 * Regression test for TIKA-339: a content language already present in the
 * metadata must still be there, unchanged, after the HTML parser has run.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
 */
@Test
public void testIgnoreCharsetDetectorLanguage() throws Exception {
    // Seed the metadata with a language before parsing.
    Metadata metadata = new Metadata();
    metadata.add(Metadata.CONTENT_LANGUAGE, "en");

    String html = "<html><title>Simple Content</title><body></body></html>";
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(input, new BodyContentHandler(), metadata, new ParseContext());

    assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class HtmlParserTest, method testImgUrlExtraction.
/**
 * Regression test for TIKA-463: elements that carry URLs must not be skipped.
 * An {@code <img>} with a relative {@code src} is parsed in a document that
 * declares a {@code <base href>}, and the output is checked for the resolved URL.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testImgUrlExtraction() throws Exception {
    final String html = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><body><img src=\"image.jpg\" /></body></html>";

    StringWriter output = new StringWriter();
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // The <img> tag should be present with its src resolved against the base href.
    String serialized = output.toString();
    Pattern resolvedSrc = Pattern.compile("(?s).*src=\"http://domain.com/image.jpg\".*$");
    assertTrue(resolvedSrc.matcher(serialized).matches());
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class BatchProcessBuilder, method build.
/**
 * Builds a {@link BatchProcess} from runtime arguments and the input stream of
 * an XML configuration file. With the exception of the QueueBuilder, the
 * builders choose how to adjudicate between runtime arguments and the elements
 * in the configuration file.
 * <p>
 * The stream is parsed into a DOM document whose root element is handed to the
 * {@code Node}-based {@code build} overload. This method does not close the
 * supplied stream; that remains the caller's responsibility.
 *
 * @param is                configuration input stream
 * @param runtimeAttributes incoming runtime attributes
 * @return the configured batch process
 * @throws IOException if the stream cannot be read, or (wrapping the original
 *         cause) if obtaining the document builder or parsing the XML fails
 */
public BatchProcess build(InputStream is, Map<String, String> runtimeAttributes) throws IOException {
    final Document config;
    try {
        config = new ParseContext().getDocumentBuilder().parse(is);
    } catch (TikaException | SAXException e) {
        // Callers only deal with IOException; keep the original failure as the cause.
        throw new IOExceptionWithCause(e);
    }
    Node root = config.getDocumentElement();
    return build(root, runtimeAttributes);
}
Aggregations