Search in sources :

Example 1 with ElementMetadataHandler

use of org.apache.tika.parser.xml.ElementMetadataHandler in project tika by apache.

the class OpenDocumentMetaParser method getContentHandler.

/**
 * Builds the chain of content handlers that copies OpenDocument meta.xml values into
 * the given metadata object.
 *
 * <p>The chain is assembled in a fixed order: the parent class's handler teed with the
 * Dublin Core element handlers, then the OO meta attributes, the user defined
 * attributes, the document statistics (current, Tika-1.0 style and legacy "nb*" keys),
 * and finally everything is wrapped in an {@code NSNormalizerContentHandler}.
 *
 * @param ch      the downstream content handler to decorate
 * @param md      the metadata object the handlers write into
 * @param context the parse context, forwarded to the parent implementation
 * @return the fully assembled, namespace-normalising content handler
 */
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
    // DcXMLParser can no longer be extended here because of how dc:subject and dc:date
    // have to be handled, so the Dublin Core elements are wired up explicitly.
    ContentHandler chain = new TeeContentHandler(
            super.getContentHandler(ch, md, context),
            getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
            getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
            getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
            getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
            getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
            getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
            getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
            getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
            getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
            getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));

    // --- OO meta attributes ---
    chain = getMeta(chain, md, TikaCoreProperties.CREATED, "creation-date");

    // In ODF the dc:date element carries the modification date.
    ContentHandler modifiedHandler = new ElementMetadataHandler(
            DublinCore.NAMESPACE_URI_DC, "date", md, TikaCoreProperties.MODIFIED);
    chain = new TeeContentHandler(chain, modifiedHandler);

    // In ODF the dc:subject element is used for the description, so it is recorded
    // under the transitional subject property rather than as a plain DC subject.
    ContentHandler subjectHandler = new ElementMetadataHandler(
            DublinCore.NAMESPACE_URI_DC, "subject", md,
            TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT);
    chain = new TeeContentHandler(chain, subjectHandler);

    chain = getMeta(chain, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
    chain = getMeta(chain, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
    chain = getMeta(chain, md, Property.externalText("editing-cycles"), "editing-cycles");
    chain = getMeta(chain, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
    chain = getMeta(chain, md, Property.externalText("generator"), "generator");

    // --- User defined meta attributes ---
    chain = getUserDefined(chain, md);

    // --- OO statistics attributes ---
    // Note: page-count is recorded under both Office.PAGE_COUNT and PagedText.N_PAGES.
    chain = getStatistic(chain, md, Office.OBJECT_COUNT, "object-count");
    chain = getStatistic(chain, md, Office.IMAGE_COUNT, "image-count");
    chain = getStatistic(chain, md, Office.PAGE_COUNT, "page-count");
    chain = getStatistic(chain, md, PagedText.N_PAGES, "page-count");
    chain = getStatistic(chain, md, Office.TABLE_COUNT, "table-count");
    chain = getStatistic(chain, md, Office.PARAGRAPH_COUNT, "paragraph-count");
    chain = getStatistic(chain, md, Office.WORD_COUNT, "word-count");
    chain = getStatistic(chain, md, Office.CHARACTER_COUNT, "character-count");

    // --- Legacy, Tika-1.0 style attributes ---
    // TODO Remove these in Tika 2.0
    chain = getStatistic(chain, md, MSOffice.OBJECT_COUNT, "object-count");
    chain = getStatistic(chain, md, MSOffice.IMAGE_COUNT, "image-count");
    chain = getStatistic(chain, md, MSOffice.PAGE_COUNT, "page-count");
    chain = getStatistic(chain, md, MSOffice.TABLE_COUNT, "table-count");
    chain = getStatistic(chain, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
    chain = getStatistic(chain, md, MSOffice.WORD_COUNT, "word-count");
    chain = getStatistic(chain, md, MSOffice.CHARACTER_COUNT, "character-count");

    // --- Legacy statistics keys, superseded by the real keys above ---
    // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
    // Each row is {metadata key, meta.xml statistic attribute}; row order is the wiring order.
    String[][] legacyStatistics = {
        {"nbPage", "page-count"},
        {"nbPara", "paragraph-count"},
        {"nbWord", "word-count"},
        {"nbCharacter", "character-count"},
        {"nbTab", "table-count"},
        {"nbObject", "object-count"},
        {"nbImg", "image-count"},
    };
    for (String[] legacy : legacyStatistics) {
        chain = getStatistic(chain, md, legacy[0], legacy[1]);
    }

    // Normalise the rest
    return new NSNormalizerContentHandler(chain);
}
Also used : ElementMetadataHandler(org.apache.tika.parser.xml.ElementMetadataHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Example 2 with ElementMetadataHandler

use of org.apache.tika.parser.xml.ElementMetadataHandler in project tika by apache.

the class PrescriptionParser method getContentHandler.

/**
 * Extends the parent content handler so that the {@code doctor} and {@code patient}
 * elements of the XPD namespace are also captured into the metadata under the keys
 * {@code xpd:doctor} and {@code xpd:patient}.
 *
 * @param handler  the downstream content handler, forwarded to the parent implementation
 * @param metadata the metadata object the element handlers write into
 * @param context  the parse context, forwarded to the parent implementation
 * @return a tee of the parent handler and the two element metadata handlers
 */
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
    final String xpdNamespace = "http://example.com/2011/xpd";

    // One metadata-capturing handler per element of interest.
    ContentHandler doctorHandler =
            new ElementMetadataHandler(xpdNamespace, "doctor", metadata, "xpd:doctor");
    ContentHandler patientHandler =
            new ElementMetadataHandler(xpdNamespace, "patient", metadata, "xpd:patient");

    // The parent handler receives the same SAX events alongside the two above.
    ContentHandler parentHandler = super.getContentHandler(handler, metadata, context);
    return new TeeContentHandler(parentHandler, doctorHandler, patientHandler);
}
Also used : ElementMetadataHandler(org.apache.tika.parser.xml.ElementMetadataHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Aggregations

ElementMetadataHandler (org.apache.tika.parser.xml.ElementMetadataHandler): 2, TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 2, ContentHandler (org.xml.sax.ContentHandler): 1