Use of org.apache.tika.sax.TeeContentHandler in the Apache Tika project.
From the class OpenDocumentMetaParser, method getContentHandler.
/**
 * Assembles the SAX handler chain for OpenDocument metadata.
 *
 * <p>The handler obtained from the superclass is combined, through successive
 * {@code TeeContentHandler}s, with one extraction branch per supported element or
 * attribute. The resulting chain is wrapped in an {@code NSNormalizerContentHandler}
 * before being returned.
 *
 * <p>NOTE(review): this looks like an override of the superclass method of the same
 * name (it delegates to {@code super.getContentHandler}); consider adding
 * {@code @Override} once confirmed against the parent class.
 *
 * @param ch      handler forwarded to the superclass to obtain the base of the chain
 * @param md      metadata object handed to every extraction branch
 * @param context parse context forwarded to the superclass
 * @return the assembled handler chain
 */
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
    // DcXMLParser can no longer be extended because of how dc:subject and dc:date
    // must be handled, so the Dublin Core elements are wired up explicitly here.
    ContentHandler chain = new TeeContentHandler(
            super.getContentHandler(ch, md, context),
            getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
            getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
            getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
            getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
            getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
            getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
            getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
            getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
            getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
            getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));

    // OO meta elements
    chain = getMeta(chain, md, TikaCoreProperties.CREATED, "creation-date");

    // In ODF, dc:date carries the modification date
    ContentHandler modifiedBranch = new ElementMetadataHandler(
            DublinCore.NAMESPACE_URI_DC, "date", md, TikaCoreProperties.MODIFIED);
    chain = new TeeContentHandler(chain, modifiedBranch);

    // In ODF, dc:subject is used for the description
    ContentHandler subjectBranch = new ElementMetadataHandler(
            DublinCore.NAMESPACE_URI_DC, "subject", md,
            TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT);
    chain = new TeeContentHandler(chain, subjectBranch);

    chain = getMeta(chain, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
    chain = getMeta(chain, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
    chain = getMeta(chain, md, Property.externalText("editing-cycles"), "editing-cycles");
    chain = getMeta(chain, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
    chain = getMeta(chain, md, Property.externalText("generator"), "generator");

    // User-defined meta elements
    chain = getUserDefined(chain, md);

    // OO document statistics (page-count is recorded under two keys)
    chain = getStatistic(chain, md, Office.OBJECT_COUNT, "object-count");
    chain = getStatistic(chain, md, Office.IMAGE_COUNT, "image-count");
    chain = getStatistic(chain, md, Office.PAGE_COUNT, "page-count");
    chain = getStatistic(chain, md, PagedText.N_PAGES, "page-count");
    chain = getStatistic(chain, md, Office.TABLE_COUNT, "table-count");
    chain = getStatistic(chain, md, Office.PARAGRAPH_COUNT, "paragraph-count");
    chain = getStatistic(chain, md, Office.WORD_COUNT, "word-count");
    chain = getStatistic(chain, md, Office.CHARACTER_COUNT, "character-count");

    // Legacy, Tika-1.0 style keys
    // TODO Remove these in Tika 2.0
    chain = getStatistic(chain, md, MSOffice.OBJECT_COUNT, "object-count");
    chain = getStatistic(chain, md, MSOffice.IMAGE_COUNT, "image-count");
    chain = getStatistic(chain, md, MSOffice.PAGE_COUNT, "page-count");
    chain = getStatistic(chain, md, MSOffice.TABLE_COUNT, "table-count");
    chain = getStatistic(chain, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
    chain = getStatistic(chain, md, MSOffice.WORD_COUNT, "word-count");
    chain = getStatistic(chain, md, MSOffice.CHARACTER_COUNT, "character-count");

    // Oldest statistics keys, replaced by the real keys above.
    // Each row is {metadata key, meta:document-statistic attribute}; rows are applied in order.
    // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
    String[][] legacyStatistics = {
        {"nbPage", "page-count"},
        {"nbPara", "paragraph-count"},
        {"nbWord", "word-count"},
        {"nbCharacter", "character-count"},
        {"nbTab", "table-count"},
        {"nbObject", "object-count"},
        {"nbImg", "image-count"}
    };
    for (String[] row : legacyStatistics) {
        chain = getStatistic(chain, md, row[0], row[1]);
    }

    // Normalise the rest
    return new NSNormalizerContentHandler(chain);
}
Use of org.apache.tika.sax.TeeContentHandler in the Apache Tika project.
From the class OpenDocumentMetaParser, method getMeta.
/**
 * Extends {@code ch} with a branch that feeds a {@code MetadataHandler} for
 * {@code property} whenever the {@code meta:<element>} element, or text beneath it,
 * is matched.
 *
 * @param ch       handler that continues to receive all events
 * @param md       metadata object passed to the new {@code MetadataHandler}
 * @param property metadata property passed to the new {@code MetadataHandler}
 * @param element  local name of the element in the {@code meta} namespace prefix
 * @return a {@code TeeContentHandler} over {@code ch} and the new branch
 */
private static ContentHandler getMeta(ContentHandler ch, Metadata md, Property property, String element) {
    String elementPath = "//meta:" + element;
    // Match both the element itself and any text nodes below it.
    Matcher elementMatcher = META_XPATH.parse(elementPath);
    Matcher textMatcher = META_XPATH.parse(elementPath + "//text()");
    Matcher eitherMatcher = new CompositeMatcher(elementMatcher, textMatcher);

    MetadataHandler recorder = new MetadataHandler(md, property);
    return new TeeContentHandler(ch, new MatchingContentHandler(recorder, eitherMatcher));
}
Use of org.apache.tika.sax.TeeContentHandler in the Apache Tika project.
From the class OpenDocumentMetaParser, method getStatistic.
/**
 * Extends {@code ch} with a branch that captures one attribute of
 * {@code meta:document-statistic} and stores it under a plain string metadata key.
 *
 * @param ch        handler that continues to receive all events
 * @param md        metadata object passed to the new {@code AttributeMetadataHandler}
 * @param name      string metadata key passed to the new {@code AttributeMetadataHandler}
 * @param attribute local name of the statistic attribute (e.g. {@code "page-count"})
 * @return a {@code TeeContentHandler} over {@code ch} and the new branch
 */
@Deprecated
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name, String attribute) {
    String attributePath = "//meta:document-statistic/@meta:" + attribute;
    Matcher attributeMatcher = META_XPATH.parse(attributePath);

    AttributeMetadataHandler recorder = new AttributeMetadataHandler(META_NS, attribute, md, name);
    return new TeeContentHandler(ch, new MatchingContentHandler(recorder, attributeMatcher));
}
Use of org.apache.tika.sax.TeeContentHandler in the Apache Tika project.
From the class OpenDocumentMetaParser, method getUserDefined.
/**
 * Extends {@code ch} with a branch that handles {@code meta:user-defined} elements.
 *
 * <p>For example, {@code <meta:user-defined meta:name="Info1">Text1</meta:user-defined>}
 * becomes {@code custom:Info1=Text1}.
 *
 * @param ch handler that continues to receive all events
 * @param md metadata object passed to the new {@code AttributeDependantMetadataHandler}
 * @return a {@code TeeContentHandler} over {@code ch} and the new branch
 */
private static ContentHandler getUserDefined(ContentHandler ch, Metadata md) {
    // The branch needs both the meta:name attribute and the element's text content.
    Matcher nameAttributeMatcher = META_XPATH.parse("//meta:user-defined/@meta:name");
    Matcher textMatcher = META_XPATH.parse("//meta:user-defined//text()");
    Matcher eitherMatcher = new CompositeMatcher(nameAttributeMatcher, textMatcher);

    AttributeDependantMetadataHandler recorder = new AttributeDependantMetadataHandler(
            md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX);
    return new TeeContentHandler(ch, new MatchingContentHandler(recorder, eitherMatcher));
}
Use of org.apache.tika.sax.TeeContentHandler in the Apache Tika project.
From the class PrescriptionParser, method getContentHandler.
/**
 * Returns the superclass handler combined with two {@code ElementMetadataHandler}s
 * that store the {@code doctor} and {@code patient} elements of the
 * {@code http://example.com/2011/xpd} namespace under the metadata keys
 * {@code xpd:doctor} and {@code xpd:patient}.
 *
 * @param handler  handler forwarded to the superclass
 * @param metadata metadata object passed to both element handlers
 * @param context  parse context forwarded to the superclass
 * @return a {@code TeeContentHandler} over the superclass handler and both element handlers
 */
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
    final String namespace = "http://example.com/2011/xpd";

    final ContentHandler doctorHandler =
            new ElementMetadataHandler(namespace, "doctor", metadata, "xpd:doctor");
    final ContentHandler patientHandler =
            new ElementMetadataHandler(namespace, "patient", metadata, "xpd:patient");

    // The superclass handler is requested only after the element handlers exist.
    final ContentHandler baseHandler = super.getContentHandler(handler, metadata, context);
    return new TeeContentHandler(baseHandler, doctorHandler, patientHandler);
}
Aggregations