Use of org.apache.tika.sax.TeeContentHandler in project tika by apache.
In class ParserPostProcessor, method parse.
/**
 * Delegates parsing to the superclass while collecting the body text on the
 * side, then records values derived from that text in the metadata:
 * the complete text under "fulltext", at most its first 500 characters under
 * "summary", and one "outlinks" value per link returned by
 * RegexUtils.extractLinks.
 *
 * @param stream   the document stream, passed through to the superclass
 * @param handler  the caller's handler; it still receives every event
 * @param metadata receives the "fulltext", "summary" and "outlinks" values
 * @param context  the parse context, passed through to the superclass
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The tee feeds both the caller's handler and our private text collector.
    ContentHandler textCollector = new BodyContentHandler();
    super.parse(stream, new TeeContentHandler(handler, textCollector), metadata, context);

    String text = textCollector.toString();
    metadata.set("fulltext", text);

    // The summary is simply the leading 500 characters (or everything, if shorter).
    String summary = text.length() > 500 ? text.substring(0, 500) : text;
    metadata.set("summary", summary);

    for (String outlink : RegexUtils.extractLinks(text)) {
        metadata.add("outlinks", outlink);
    }
}
Use of org.apache.tika.sax.TeeContentHandler in project tika by apache.
In class OpenDocumentMetaParser, method getStatistic.
/**
 * Returns a handler that forwards every event to {@code ch} and, in addition,
 * to a branch restricted by the XPath
 * {@code //meta:document-statistic/@meta:<attribute>}.
 * The branch is an AttributeMetadataHandler built from META_NS, the attribute
 * name, {@code md} and {@code property} (presumably it stores the matched
 * attribute value in {@code md} under {@code property}; confirm against that class).
 */
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, Property property, String attribute) {
    String statisticPath = "//meta:document-statistic/@meta:" + attribute;
    Matcher statisticMatcher = META_XPATH.parse(statisticPath);
    ContentHandler attributeHandler = new AttributeMetadataHandler(META_NS, attribute, md, property);
    ContentHandler statisticBranch = new MatchingContentHandler(attributeHandler, statisticMatcher);
    return new TeeContentHandler(ch, statisticBranch);
}
Use of org.apache.tika.sax.TeeContentHandler in project tika by apache.
In class TIAParsingExample, method testTeeContentHandler.
/**
 * Example of fanning parser output out to two handlers at once: a
 * BodyContentHandler that writes to the named file and a LinkContentHandler.
 * The input parsed here is an empty byte array, so this only demonstrates the
 * wiring.
 *
 * @param filename path of the file the body handler writes to
 */
public static void testTeeContentHandler(String filename) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    File target = new File(filename);

    // The output file is closed automatically once parsing has finished.
    try (OutputStream fileOut = new FileOutputStream(target)) {
        ContentHandler textToFile = new BodyContentHandler(fileOut);
        ContentHandler fanOut = new TeeContentHandler(textToFile, links);
        autoDetect.parse(emptyInput, fanOut, new Metadata(), new ParseContext());
    }
}
Use of org.apache.tika.sax.TeeContentHandler in project tika by apache.
In class HtmlParserTest, method testParseAscii.
/**
 * Parses /test-documents/testHTML.html with HtmlParser, sending events to both
 * a BodyContentHandler and a handler that records anchor attributes, then
 * checks the extracted metadata, the recorded href/name values and the body text.
 */
@Test
public void testParseAscii() throws Exception {
    final StringWriter hrefValues = new StringWriter();
    final StringWriter nameValues = new StringWriter();
    ContentHandler textHandler = new BodyContentHandler();
    Metadata parsedMetadata = new Metadata();

    // For each <a> element, record its href; only if there is none, record its name.
    ContentHandler anchorRecorder = new DefaultHandler() {
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            if (!"a".equals(localName)) {
                return;
            }
            String hrefAttr = attributes.getValue("href");
            if (hrefAttr != null) {
                hrefValues.append(hrefAttr);
                return;
            }
            String nameAttr = attributes.getValue("name");
            if (nameAttr != null) {
                nameValues.append(nameAttr);
            }
        }
    };

    try (InputStream html = HtmlParserTest.class.getResourceAsStream("/test-documents/testHTML.html")) {
        ContentHandler fanOut = new TeeContentHandler(textHandler, anchorRecorder);
        new HtmlParser().parse(html, fanOut, parsedMetadata, new ParseContext());
    }

    // Metadata extracted from the document head.
    assertEquals("Title : Test Indexation Html", parsedMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("Tika Developers", parsedMetadata.get("Author"));
    assertEquals("5", parsedMetadata.get("refresh"));
    assertEquals("51.2312", parsedMetadata.get(Geographic.LATITUDE));
    assertEquals("-5.1987", parsedMetadata.get(Geographic.LONGITUDE));

    // Anchor attributes captured by the second branch of the tee.
    assertEquals("http://www.apache.org/", hrefValues.toString());
    assertEquals("test-anchor", nameValues.toString());

    // Body text captured by the first branch of the tee.
    String text = textHandler.toString();
    for (String expected : new String[] {"Test Indexation Html", "Indexation du fichier"}) {
        assertTrue("Did not contain expected text:" + expected, text.contains(expected));
    }
}
Use of org.apache.tika.sax.TeeContentHandler in project spring-boot-quick by vector4wang.
In class TikaUtil, method handleStreamMetaDate.
/**
 * Parses the given bytes and returns the metadata collected during the parse
 * as a name-to-value map.
 *
 * <p>The returned map iterates in ascending name order. For each name only the
 * value returned by {@code Metadata.get(name)} is kept.
 *
 * @param file the raw document bytes to parse
 * @return the metadata names mapped to their values, ordered by name
 * @throws Exception if parsing fails or the input stream cannot be closed
 */
public static Map<String, String> handleStreamMetaDate(byte[] file) throws Exception {
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(getTextContentHandler(textBuffer));

    // Close the TikaInputStream even when parsing throws, so any resources it
    // holds are released instead of leaking.
    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }

    String[] names = md.names();
    Arrays.sort(names);

    // An insertion-ordered map keeps the sorted order established above; a plain
    // HashMap would discard it. Fully qualified to avoid requiring a new import.
    Map<String, String> meta = new java.util.LinkedHashMap<>();
    for (String name : names) {
        meta.put(name, md.get(name));
    }
    return meta;
}
Aggregations