Search in sources :

Example 51 with DocumentFragment

use of org.w3c.dom.DocumentFragment in project nutch by apache.

In the class TikaParser, the method getParse.

/**
 * Parses the given content with the Tika parser registered for its MIME type and
 * turns the result into a Nutch {@link ParseResult}.
 *
 * <p>The SAX events emitted by Tika are captured into a DOM {@link DocumentFragment}
 * (optionally passing through Boilerpipe first) and, in parallel, into a
 * {@link LinkContentHandler}. The fragment is then used to read the meta directives,
 * text and title; the collected links become the outlinks. Tika metadata (except the
 * title) is copied into the Nutch parse metadata, and the configured HTML parse
 * filters are run on the result before it is returned.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result; an empty parse result carrying a failure status
 *         when the base URL is malformed, no Tika parser is available for the MIME
 *         type, or the Tika parser throws
 */
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();
    // Boilerpipe is used only when "tika.extractor" is set to "boilerpipe" (default "none").
    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        // without a usable base URL nothing is parsed; the exception is wrapped in the status
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    }
    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    Metadata tikamd = new Metadata();
    // DOM document and fragment that receive the SAX events produced by Tika
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    // presumably disabled so that imperfect markup does not raise DOM exceptions -- TODO confirm
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe) {
        // Boilerpipe wraps the DOM builder; markup is kept so that a DOM can still be built
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        bpHandler.setIncludeMarkup(true);
        domHandler = (ContentHandler) bpHandler;
    } else {
        // plain DOM builder; only this branch applies the element-name case and XHTML namespace settings
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domBuilder.setUpperCaseElementNames(upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        domHandler = (ContentHandler) domBuilder;
    }
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    // the tee forwards every SAX event to both the DOM handler and the link collector
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    // the custom HTML mapper is optional and only registered when one is set
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        // any parser failure is logged and reported as a FAILED status with the exception message
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    }
    HTMLMetaTags metaTags = new HTMLMetaTags();
    // defaults used when the meta directives forbid indexing and/or following
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        // reset the buffer so it can be reused for the title
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        // presumably Tika exposes the document's <base href> under "Content-Location" -- TODO confirm
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                // an unusable base href is only traced; baseTag keeps the document base URL
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        }
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // the Tika title is not copied into the parse metadata
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
            continue;
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) nutchMetadata.add(tikaMDName, v);
    }
    // fall back to extracting outlinks from the plain text when none were collected above
    // NOTE(review): this also runs when the nofollow directive skipped link extraction -- confirm that is intended
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        // a meta refresh is reported as a redirect; args carry the target href and the refresh time
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        // mark every parse in the filtered result with the configured caching policy
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 52 with DocumentFragment

use of org.w3c.dom.DocumentFragment in project cxf by apache.

In the class StaxSerializer, the method appendNewChild.

/**
 * Copies the whole of {@code reader} into {@code writer} and then collects the nodes that
 * are found under {@code element} afterwards.
 *
 * <p>NOTE(review): presumably {@code writer} is a DOM writer targeting {@code element}, so the
 * copied content ends up as its children -- confirm against the caller.
 *
 * @param reader          source of the XML events to copy
 * @param wrapped         when {@code true}, the first child of {@code element} is treated as a
 *                        wrapper and its children are collected instead
 * @param contextDocument document used to create the fragment returned for zero or several nodes
 * @param writer          destination of the copied XML events
 * @param element         element whose children are collected after the copy
 * @return the single collected node when there is exactly one; otherwise a fragment owned by
 *         {@code contextDocument} holding all collected nodes (empty when there are none)
 * @throws XMLStreamException if copying from {@code reader} to {@code writer} fails
 */
private Node appendNewChild(XMLStreamReader reader, boolean wrapped, Document contextDocument, XMLStreamWriter writer, Element element) throws XMLStreamException {
    StaxUtils.copy(reader, writer);
    Node child = element.getFirstChild();
    // Guard against an element that received no children: without this check the wrapped
    // case dereferenced null and failed with a NullPointerException.
    if (wrapped && child != null) {
        child = child.getFirstChild();
    }
    // A lone node is handed back directly, no fragment needed.
    if (child != null && child.getNextSibling() == null) {
        return child;
    }
    DocumentFragment result = contextDocument.createDocumentFragment();
    while (child != null) {
        // appendChild moves the node, so remember its sibling before it is detached
        Node nextChild = child.getNextSibling();
        result.appendChild(child);
        child = nextChild;
    }
    return result;
}
Also used : Node(org.w3c.dom.Node) DocumentFragment(org.w3c.dom.DocumentFragment)

Example 53 with DocumentFragment

use of org.w3c.dom.DocumentFragment in project cxf by apache.

In the class AbstractBindingBuilder, the method cloneElement.

/**
 * Returns a version of {@code el} that belongs to the security header's document.
 *
 * <p>If {@code el} is already owned by that document it is returned unchanged. Otherwise its
 * content is streamed into a fragment of the security header's document and the first node
 * of that fragment is returned. If the streaming copy fails, the failure is logged at
 * {@code FINE} and the original element is returned as a fallback.
 *
 * @param el the element to bring into the security header's document
 * @return the copied element, or {@code el} itself when no copy was needed or the copy failed
 */
protected Element cloneElement(Element el) {
    Document targetDoc = secHeader.getSecurityHeaderElement().getOwnerDocument();
    if (targetDoc.equals(el.getOwnerDocument())) {
        // same document already: nothing to copy
        return el;
    }
    XMLStreamReader source = StaxUtils.createXMLStreamReader(el);
    DocumentFragment holder = targetDoc.createDocumentFragment();
    W3CDOMStreamWriter sink = new W3CDOMStreamWriter(holder);
    try {
        StaxUtils.copy(source, sink);
        return (Element) holder.getFirstChild();
    } catch (XMLStreamException ex) {
        // best effort: keep going with the original element
        LOG.log(Level.FINE, "Error cloning security element", ex);
        return el;
    }
}
Also used : W3CDOMStreamWriter(org.apache.cxf.staxutils.W3CDOMStreamWriter) XMLStreamReader(javax.xml.stream.XMLStreamReader) XMLStreamException(javax.xml.stream.XMLStreamException) Element(org.w3c.dom.Element) Document(org.w3c.dom.Document) DocumentFragment(org.w3c.dom.DocumentFragment)

Example 54 with DocumentFragment

use of org.w3c.dom.DocumentFragment in project cxf by apache.

In the class IssueJWTClaimsUnitTest, the method testIssueJWTTokenOnBehalfOfSaml2DifferentRealmFederateClaims.

/**
 * Test to successfully issue a JWT token (realm "B") on-behalf-of a SAML 2 token
 * which was issued by realm "A".
 * The relationship type between realm A and B is: FederateClaims
 *
 * <p>The issued JWT is expected to keep the subject "alice" and to carry the last-name
 * claim in upper case ("DOE").
 */
@org.junit.Test
public void testIssueJWTTokenOnBehalfOfSaml2DifferentRealmFederateClaims() throws Exception {
    TokenIssueOperation issueOperation = new TokenIssueOperation();
    Map<String, RealmProperties> realms = createSamlRealms();
    // Add Token Provider
    List<TokenProvider> providerList = new ArrayList<>();
    JWTTokenProvider tokenProvider = new JWTTokenProvider();
    tokenProvider.setRealmMap(realms);
    providerList.add(tokenProvider);
    issueOperation.setTokenProviders(providerList);
    TokenDelegationHandler delegationHandler = new SAMLDelegationHandler();
    issueOperation.setDelegationHandlers(Collections.singletonList(delegationHandler));
    // Add Token Validator
    List<TokenValidator> validatorList = new ArrayList<>();
    SAMLTokenValidator samlTokenValidator = new SAMLTokenValidator();
    samlTokenValidator.setSamlRealmCodec(new IssuerSAMLRealmCodec());
    validatorList.add(samlTokenValidator);
    issueOperation.setTokenValidators(validatorList);
    addService(issueOperation);
    // Add Relationship list
    List<Relationship> relationshipList = new ArrayList<>();
    Relationship rs = createRelationship();
    relationshipList.add(rs);
    // Add STSProperties object
    Crypto crypto = CryptoFactory.getInstance(getEncryptionProperties());
    STSPropertiesMBean stsProperties = createSTSPropertiesMBean(crypto);
    stsProperties.setRealmParser(new CustomRealmParser());
    stsProperties.setIdentityMapper(new CustomIdentityMapper());
    stsProperties.setRelationships(relationshipList);
    issueOperation.setStsProperties(stsProperties);
    // Set the ClaimsManager
    ClaimsManager claimsManager = new ClaimsManager();
    ClaimsHandler claimsHandler = new CustomClaimsHandler();
    claimsManager.setClaimHandlers(Collections.singletonList(claimsHandler));
    issueOperation.setClaimsManager(claimsManager);
    // Mock up a request
    RequestSecurityTokenType request = new RequestSecurityTokenType();
    JAXBElement<String> tokenType = new JAXBElement<String>(QNameConstants.TOKEN_TYPE, String.class, JWTTokenProvider.JWT_TOKEN_TYPE);
    request.getAny().add(tokenType);
    // Add a ClaimsType
    ClaimsType claimsType = new ClaimsType();
    claimsType.setDialect(STSConstants.IDT_NS_05_05);
    Document doc = DOMUtils.getEmptyDocument();
    Element claimType = createClaimsType(doc);
    claimsType.getAny().add(claimType);
    JAXBElement<ClaimsType> claimsTypeJaxb = new JAXBElement<ClaimsType>(QNameConstants.CLAIMS, ClaimsType.class, claimsType);
    request.getAny().add(claimsTypeJaxb);
    // request.getAny().add(createAppliesToElement("http://dummy-service.com/dummy"));
    // create a SAML Token via the SAMLTokenProvider which contains claims
    CallbackHandler callbackHandler = new PasswordCallbackHandler();
    Element samlToken = createSAMLAssertion(WSS4JConstants.WSS_SAML2_TOKEN_TYPE, crypto, "mystskey", callbackHandler, realms);
    // NOTE(review): the assertion is first moved into a fresh fragment and then appended to
    // its owner document; presumably this detaches it from wherever the provider left it
    // before making it the document's child -- confirm.
    DocumentFragment f = samlToken.getOwnerDocument().createDocumentFragment();
    f.appendChild(samlToken);
    Document docToken = samlToken.getOwnerDocument();
    samlToken = (Element) docToken.appendChild(samlToken);
    // sanity-check the serialized assertion before using it in the request
    String samlString = DOM2Writer.nodeToString(samlToken);
    assertTrue(samlString.contains("AttributeStatement"));
    assertTrue(samlString.contains("alice"));
    assertTrue(samlString.contains("doe"));
    assertTrue(samlString.contains(SAML2Constants.CONF_BEARER));
    // add SAML token as On-Behalf-Of element
    OnBehalfOfType onbehalfof = new OnBehalfOfType();
    onbehalfof.setAny(samlToken);
    JAXBElement<OnBehalfOfType> onbehalfofType = new JAXBElement<OnBehalfOfType>(QNameConstants.ON_BEHALF_OF, OnBehalfOfType.class, onbehalfof);
    request.getAny().add(onbehalfofType);
    // Mock up message context
    MessageImpl msg = new MessageImpl();
    WrappedMessageContext msgCtx = new WrappedMessageContext(msg);
    msgCtx.put("url", "https");
    List<RequestSecurityTokenResponseType> securityTokenResponseList = issueToken(issueOperation, request, new CustomTokenPrincipal("alice"), msgCtx);
    // Test the generated token.
    // Pick the first RequestedSecurityToken entry out of the first response.
    Element token = null;
    for (Object tokenObject : securityTokenResponseList.get(0).getAny()) {
        if (tokenObject instanceof JAXBElement<?> && REQUESTED_SECURITY_TOKEN.equals(((JAXBElement<?>) tokenObject).getName())) {
            RequestedSecurityTokenType rstType = (RequestedSecurityTokenType) ((JAXBElement<?>) tokenObject).getValue();
            token = (Element) rstType.getAny();
            break;
        }
    }
    assertNotNull(token);
    // Validate the token
    // the compact JWT is carried as the text content of the returned element
    JwsJwtCompactConsumer jwtConsumer = new JwsJwtCompactConsumer(token.getTextContent());
    JwtToken jwt = jwtConsumer.getJwtToken();
    // subject unchanged
    Assert.assertEquals("alice", jwt.getClaim(JwtConstants.CLAIM_SUBJECT));
    // transformed claim (to uppercase)
    // expected value goes first so that a failure reports expected/actual the right way round
    assertEquals("DOE", jwt.getClaim(ClaimTypes.LASTNAME.toString()));
}
Also used : CallbackHandler(javax.security.auth.callback.CallbackHandler) PasswordCallbackHandler(org.apache.cxf.sts.common.PasswordCallbackHandler) RequestSecurityTokenType(org.apache.cxf.ws.security.sts.provider.model.RequestSecurityTokenType) JAXBElement(javax.xml.bind.JAXBElement) Element(org.w3c.dom.Element) ArrayList(java.util.ArrayList) RequestSecurityTokenResponseType(org.apache.cxf.ws.security.sts.provider.model.RequestSecurityTokenResponseType) RequestedSecurityTokenType(org.apache.cxf.ws.security.sts.provider.model.RequestedSecurityTokenType) Document(org.w3c.dom.Document) CustomClaimsHandler(org.apache.cxf.sts.common.CustomClaimsHandler) CustomTokenPrincipal(org.apache.wss4j.common.principal.CustomTokenPrincipal) TokenProvider(org.apache.cxf.sts.token.provider.TokenProvider) JWTTokenProvider(org.apache.cxf.sts.token.provider.jwt.JWTTokenProvider) SAMLTokenProvider(org.apache.cxf.sts.token.provider.SAMLTokenProvider) TokenValidator(org.apache.cxf.sts.token.validator.TokenValidator) SAMLTokenValidator(org.apache.cxf.sts.token.validator.SAMLTokenValidator) IssuerSAMLRealmCodec(org.apache.cxf.sts.token.validator.IssuerSAMLRealmCodec) ClaimsManager(org.apache.cxf.sts.claims.ClaimsManager) PasswordCallbackHandler(org.apache.cxf.sts.common.PasswordCallbackHandler) JwsJwtCompactConsumer(org.apache.cxf.rs.security.jose.jws.JwsJwtCompactConsumer) TokenDelegationHandler(org.apache.cxf.sts.token.delegation.TokenDelegationHandler) RealmProperties(org.apache.cxf.sts.token.realm.RealmProperties) DocumentFragment(org.w3c.dom.DocumentFragment) JWTTokenProvider(org.apache.cxf.sts.token.provider.jwt.JWTTokenProvider) ClaimsHandler(org.apache.cxf.sts.claims.ClaimsHandler) CustomClaimsHandler(org.apache.cxf.sts.common.CustomClaimsHandler) ClaimsType(org.apache.cxf.ws.security.sts.provider.model.ClaimsType) SAMLDelegationHandler(org.apache.cxf.sts.token.delegation.SAMLDelegationHandler) JAXBElement(javax.xml.bind.JAXBElement) 
JwtToken(org.apache.cxf.rs.security.jose.jwt.JwtToken) OnBehalfOfType(org.apache.cxf.ws.security.sts.provider.model.OnBehalfOfType) Crypto(org.apache.wss4j.common.crypto.Crypto) STSPropertiesMBean(org.apache.cxf.sts.STSPropertiesMBean) Relationship(org.apache.cxf.sts.token.realm.Relationship) WrappedMessageContext(org.apache.cxf.jaxws.context.WrappedMessageContext) SAMLTokenValidator(org.apache.cxf.sts.token.validator.SAMLTokenValidator) MessageImpl(org.apache.cxf.message.MessageImpl)

Example 55 with DocumentFragment

use of org.w3c.dom.DocumentFragment in project cxf by apache.

In the class StaxUtils, the method writeNode.

/**
 * Writes a single DOM node to the given stream writer, dispatching on the node type.
 *
 * <p>Elements and documents are delegated to {@code writeElement} / {@code writeDocument};
 * a document fragment is written child by child through this method; text, comments, CDATA,
 * entity references and processing instructions map to the matching writer call. A document
 * type node is written as a DTD when its text content is non-null, and an
 * {@link UnsupportedOperationException} raised while doing so is ignored.
 *
 * @param n         the node to write
 * @param writer    the destination stream writer
 * @param repairing passed through to the element/document writers and to recursive calls
 * @throws XMLStreamException    if the writer reports an error
 * @throws IllegalStateException if the node type is not one of those handled here
 */
public static void writeNode(Node n, XMLStreamWriter writer, boolean repairing) throws XMLStreamException {
    final short kind = n.getNodeType();
    switch (kind) {
        // --- container nodes ---
        case Node.DOCUMENT_NODE:
            writeDocument((Document) n, writer, repairing);
            break;
        case Node.DOCUMENT_FRAGMENT_NODE: {
            DocumentFragment fragment = (DocumentFragment) n;
            for (Node kid = fragment.getFirstChild(); kid != null; kid = kid.getNextSibling()) {
                writeNode(kid, writer, repairing);
            }
            break;
        }
        case Node.ELEMENT_NODE:
            writeElement((Element) n, writer, repairing);
            break;
        // --- character data ---
        case Node.TEXT_NODE: {
            Text text = (Text) n;
            writer.writeCharacters(text.getNodeValue());
            break;
        }
        case Node.CDATA_SECTION_NODE: {
            CDATASection cdata = (CDATASection) n;
            writer.writeCData(cdata.getData());
            break;
        }
        case Node.COMMENT_NODE: {
            Comment comment = (Comment) n;
            writer.writeComment(comment.getData());
            break;
        }
        // --- everything else ---
        case Node.ENTITY_REFERENCE_NODE: {
            EntityReference ref = (EntityReference) n;
            writer.writeEntityRef(ref.getNodeValue());
            break;
        }
        case Node.PROCESSING_INSTRUCTION_NODE: {
            ProcessingInstruction instruction = (ProcessingInstruction) n;
            writer.writeProcessingInstruction(instruction.getTarget(), instruction.getData());
            break;
        }
        case Node.DOCUMENT_TYPE_NODE: {
            try {
                DocumentType docType = (DocumentType) n;
                if (docType.getTextContent() != null) {
                    writer.writeDTD(docType.getTextContent());
                }
            } catch (UnsupportedOperationException ex) {
                // deliberately ignored: DOM-backed writers do not allow this,
                // as there is no way to write a DTD into a DOM
            }
            break;
        }
        default:
            throw new IllegalStateException("Found type: " + n.getClass().getName());
    }
}
Also used : Node(org.w3c.dom.Node) ProcessingInstruction(org.w3c.dom.ProcessingInstruction) DocumentFragment(org.w3c.dom.DocumentFragment)

Aggregations

DocumentFragment (org.w3c.dom.DocumentFragment)57 Document (org.w3c.dom.Document)27 Element (org.w3c.dom.Element)24 Node (org.w3c.dom.Node)20 NodeList (org.w3c.dom.NodeList)17 JAXBElement (javax.xml.bind.JAXBElement)8 Marshaller (javax.xml.bind.Marshaller)6 IOException (java.io.IOException)5 ArrayList (java.util.ArrayList)5 XMLStreamException (javax.xml.stream.XMLStreamException)5 DOMException (org.w3c.dom.DOMException)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 JAXBException (javax.xml.bind.JAXBException)4 DOMFragmentParser (org.cyberneko.html.parsers.DOMFragmentParser)4 Text (org.w3c.dom.Text)4 SAXException (org.xml.sax.SAXException)4 MalformedURLException (java.net.MalformedURLException)3 LinkedHashMap (java.util.LinkedHashMap)3 DocumentBuilder (javax.xml.parsers.DocumentBuilder)3 XMLStreamReader (javax.xml.stream.XMLStreamReader)3