Use of org.w3c.dom.DocumentFragment in project Nutch by Apache.
The class TikaParser, method getParse.
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
  String mimeType = content.getContentType();

  boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
  String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");

  URL base;
  try {
    base = new URL(content.getBaseUrl());
  } catch (MalformedURLException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // get the right parser using the mime type as a clue
  Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
  byte[] raw = content.getContent();
  if (parser == null) {
    String message = "Can't retrieve Tika parser for mime-type " + mimeType;
    LOG.error(message);
    return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
  }
  LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

  Metadata tikamd = new Metadata();

  HTMLDocumentImpl doc = new HTMLDocumentImpl();
  doc.setErrorChecking(false);
  DocumentFragment root = doc.createDocumentFragment();

  ContentHandler domHandler;

  // Check whether to use Tika's BoilerplateContentHandler
  if (useBoilerpipe) {
    BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
        (ContentHandler) new DOMBuilder(doc, root),
        BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
    bpHandler.setIncludeMarkup(true);
    domHandler = (ContentHandler) bpHandler;
  } else {
    DOMBuilder domBuilder = new DOMBuilder(doc, root);
    domBuilder.setUpperCaseElementNames(upperCaseElementNames);
    domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
    domHandler = (ContentHandler) domBuilder;
  }

  LinkContentHandler linkContentHandler = new LinkContentHandler();

  ParseContext context = new ParseContext();
  TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);

  if (HTMLMapper != null)
    context.set(HtmlMapper.class, HTMLMapper);
  tikamd.set(Metadata.CONTENT_TYPE, mimeType);
  try {
    parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
  } catch (Exception e) {
    LOG.error("Error parsing " + content.getUrl(), e);
    return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
  }

  HTMLMetaTags metaTags = new HTMLMetaTags();
  String text = "";
  String title = "";
  Outlink[] outlinks = new Outlink[0];
  org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

  // we have converted the sax events generated by Tika into a DOM object
  // so we can now use the usual HTML resources from Nutch
  // get meta directives
  HTMLMetaProcessor.getMetaTags(metaTags, root, base);
  if (LOG.isTraceEnabled()) {
    LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
  }

  // check meta directives
  if (!metaTags.getNoIndex()) {
    // okay to index
    StringBuffer sb = new StringBuffer();
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting text...");
    }
    // extract text
    utils.getText(sb, root);
    text = sb.toString();
    sb.setLength(0);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting title...");
    }
    // extract title
    utils.getTitle(sb, root);
    title = sb.toString().trim();
  }

  if (!metaTags.getNoFollow()) {
    // okay to follow links
    // extract outlinks
    ArrayList<Outlink> l = new ArrayList<Outlink>();
    URL baseTag = base;
    String baseTagHref = tikamd.get("Content-Location");
    if (baseTagHref != null) {
      try {
        baseTag = new URL(base, baseTagHref);
      } catch (MalformedURLException e) {
        LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
      }
    }
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting links (base URL = {}) ...", baseTag);
    }

    // pre-1233 outlink extraction
    // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
    // Get outlinks from Tika
    List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
    utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
    outlinks = l.toArray(new Outlink[l.size()]);
    if (LOG.isTraceEnabled()) {
      LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    }
  }

  // populate Nutch metadata with Tika metadata
  String[] TikaMDNames = tikamd.names();
  for (String tikaMDName : TikaMDNames) {
    if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
      continue;
    String[] values = tikamd.getValues(tikaMDName);
    for (String v : values)
      nutchMetadata.add(tikaMDName, v);
  }

  if (outlinks.length == 0) {
    outlinks = OutlinkExtractor.getOutlinks(text, getConf());
  }

  ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
  if (metaTags.getRefresh()) {
    status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
    status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
        Integer.toString(metaTags.getRefreshTime()) });
  }

  ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
  ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

  // run filters on parse
  ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
  if (metaTags.getNoCache()) {
    // not okay to cache
    for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
      entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
  }
  return filteredParse;
}
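For context, the DocumentFragment above is the detached root that Nutch's DOMBuilder fills while Tika emits SAX events; once parsing finishes, the fragment is traversed with ordinary DOM code (HTMLMetaProcessor, the getText/getTitle/getOutlinks utilities). A minimal sketch of that idiom outside Nutch, assuming Tika's AutoDetectParser and the DOMBuilder/HTMLDocumentImpl classes used above (htmlBytes is a placeholder for raw page content, not something from the original code):

static void parseToFragment(byte[] htmlBytes) throws Exception {
  HTMLDocumentImpl doc = new HTMLDocumentImpl();
  doc.setErrorChecking(false);
  DocumentFragment root = doc.createDocumentFragment();
  // DOMBuilder materializes Tika's SAX events as DOM nodes under the fragment
  ContentHandler handler = new DOMBuilder(doc, root);
  new AutoDetectParser().parse(new ByteArrayInputStream(htmlBytes), handler, new Metadata(), new ParseContext());
  // the fragment can now be walked with plain DOM calls
  for (Node n = root.getFirstChild(); n != null; n = n.getNextSibling()) {
    System.out.println(n.getNodeName());
  }
}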
Use of org.w3c.dom.DocumentFragment in project CXF by Apache.
The class StaxSerializer, method appendNewChild.
private Node appendNewChild(XMLStreamReader reader, boolean wrapped, Document contextDocument,
                            XMLStreamWriter writer, Element element) throws XMLStreamException {
  // copy all events from the reader into the writer
  StaxUtils.copy(reader, writer);

  DocumentFragment result = contextDocument.createDocumentFragment();
  Node child = element.getFirstChild();
  if (wrapped) {
    child = child.getFirstChild();
  }
  // a single new child can be returned directly
  if (child != null && child.getNextSibling() == null) {
    return child;
  }
  // otherwise collect all siblings into a DocumentFragment and return that
  while (child != null) {
    Node nextChild = child.getNextSibling();
    result.appendChild(child);
    child = nextChild;
  }
  return result;
}
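The reason a DocumentFragment is returned when more than one node was produced is the standard DOM contract: appending a fragment to a parent splices in the fragment's children rather than the fragment node itself. A small, self-contained illustration (not CXF code):

static void fragmentSplicing() throws Exception {
  Document d = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
  Element parent = d.createElement("parent");

  DocumentFragment frag = d.createDocumentFragment();
  frag.appendChild(d.createElement("a"));
  frag.appendChild(d.createElement("b"));

  // appending the fragment moves its two children under "parent"
  parent.appendChild(frag);
  System.out.println(parent.getChildNodes().getLength()); // prints 2
  System.out.println(frag.hasChildNodes());               // prints false
}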
Use of org.w3c.dom.DocumentFragment in project CXF by Apache.
The class AbstractBindingBuilder, method cloneElement.
protected Element cloneElement(Element el) {
  Document doc = secHeader.getSecurityHeaderElement().getOwnerDocument();
  if (!doc.equals(el.getOwnerDocument())) {
    // the element belongs to a different document: stream it into a fragment of the
    // security header's document so that the returned clone is owned by that document
    XMLStreamReader reader = StaxUtils.createXMLStreamReader(el);
    DocumentFragment fragment = doc.createDocumentFragment();
    W3CDOMStreamWriter writer = new W3CDOMStreamWriter(fragment);
    try {
      StaxUtils.copy(reader, writer);
      return (Element) fragment.getFirstChild();
    } catch (XMLStreamException ex) {
      LOG.log(Level.FINE, "Error cloning security element", ex);
    }
  }
  return el;
}
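The same pattern reduces to a small helper: stream an Element from one document into a DocumentFragment of the target document, then take the first child as the cross-document copy. A hedged sketch reusing only the CXF calls already visible above (StaxUtils, W3CDOMStreamWriter); error handling is left to the caller:

static Element copyInto(Document targetDoc, Element source) throws XMLStreamException {
  DocumentFragment fragment = targetDoc.createDocumentFragment();
  // StaxUtils.copy replays the element's events into a DOM writer rooted at the fragment
  StaxUtils.copy(StaxUtils.createXMLStreamReader(source), new W3CDOMStreamWriter(fragment));
  // the copy is now owned by targetDoc
  return (Element) fragment.getFirstChild();
}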
Use of org.w3c.dom.DocumentFragment in project CXF by Apache.
The class IssueJWTClaimsUnitTest, method testIssueJWTTokenOnBehalfOfSaml2DifferentRealmFederateClaims.
/**
* Test to successfully issue a JWT token (realm "B") on-behalf-of a SAML 2 token
* which was issued by realm "A".
* The relationship type between realm A and B is: FederateClaims
*/
@org.junit.Test
public void testIssueJWTTokenOnBehalfOfSaml2DifferentRealmFederateClaims() throws Exception {
  TokenIssueOperation issueOperation = new TokenIssueOperation();
  Map<String, RealmProperties> realms = createSamlRealms();

  // Add Token Provider
  List<TokenProvider> providerList = new ArrayList<>();
  JWTTokenProvider tokenProvider = new JWTTokenProvider();
  tokenProvider.setRealmMap(realms);
  providerList.add(tokenProvider);
  issueOperation.setTokenProviders(providerList);

  TokenDelegationHandler delegationHandler = new SAMLDelegationHandler();
  issueOperation.setDelegationHandlers(Collections.singletonList(delegationHandler));

  // Add Token Validator
  List<TokenValidator> validatorList = new ArrayList<>();
  SAMLTokenValidator samlTokenValidator = new SAMLTokenValidator();
  samlTokenValidator.setSamlRealmCodec(new IssuerSAMLRealmCodec());
  validatorList.add(samlTokenValidator);
  issueOperation.setTokenValidators(validatorList);

  addService(issueOperation);

  // Add Relationship list
  List<Relationship> relationshipList = new ArrayList<>();
  Relationship rs = createRelationship();
  relationshipList.add(rs);

  // Add STSProperties object
  Crypto crypto = CryptoFactory.getInstance(getEncryptionProperties());
  STSPropertiesMBean stsProperties = createSTSPropertiesMBean(crypto);
  stsProperties.setRealmParser(new CustomRealmParser());
  stsProperties.setIdentityMapper(new CustomIdentityMapper());
  stsProperties.setRelationships(relationshipList);
  issueOperation.setStsProperties(stsProperties);

  // Set the ClaimsManager
  ClaimsManager claimsManager = new ClaimsManager();
  ClaimsHandler claimsHandler = new CustomClaimsHandler();
  claimsManager.setClaimHandlers(Collections.singletonList(claimsHandler));
  issueOperation.setClaimsManager(claimsManager);

  // Mock up a request
  RequestSecurityTokenType request = new RequestSecurityTokenType();
  JAXBElement<String> tokenType =
      new JAXBElement<String>(QNameConstants.TOKEN_TYPE, String.class, JWTTokenProvider.JWT_TOKEN_TYPE);
  request.getAny().add(tokenType);

  // Add a ClaimsType
  ClaimsType claimsType = new ClaimsType();
  claimsType.setDialect(STSConstants.IDT_NS_05_05);
  Document doc = DOMUtils.getEmptyDocument();
  Element claimType = createClaimsType(doc);
  claimsType.getAny().add(claimType);
  JAXBElement<ClaimsType> claimsTypeJaxb =
      new JAXBElement<ClaimsType>(QNameConstants.CLAIMS, ClaimsType.class, claimsType);
  request.getAny().add(claimsTypeJaxb);
  // request.getAny().add(createAppliesToElement("http://dummy-service.com/dummy"));

  // create a SAML Token via the SAMLTokenProvider which contains claims
  CallbackHandler callbackHandler = new PasswordCallbackHandler();
  Element samlToken =
      createSAMLAssertion(WSS4JConstants.WSS_SAML2_TOKEN_TYPE, crypto, "mystskey", callbackHandler, realms);
  DocumentFragment f = samlToken.getOwnerDocument().createDocumentFragment();
  f.appendChild(samlToken);
  Document docToken = samlToken.getOwnerDocument();
  samlToken = (Element) docToken.appendChild(samlToken);
  String samlString = DOM2Writer.nodeToString(samlToken);
  assertTrue(samlString.contains("AttributeStatement"));
  assertTrue(samlString.contains("alice"));
  assertTrue(samlString.contains("doe"));
  assertTrue(samlString.contains(SAML2Constants.CONF_BEARER));

  // add SAML token as On-Behalf-Of element
  OnBehalfOfType onbehalfof = new OnBehalfOfType();
  onbehalfof.setAny(samlToken);
  JAXBElement<OnBehalfOfType> onbehalfofType =
      new JAXBElement<OnBehalfOfType>(QNameConstants.ON_BEHALF_OF, OnBehalfOfType.class, onbehalfof);
  request.getAny().add(onbehalfofType);

  // Mock up message context
  MessageImpl msg = new MessageImpl();
  WrappedMessageContext msgCtx = new WrappedMessageContext(msg);
  msgCtx.put("url", "https");

  List<RequestSecurityTokenResponseType> securityTokenResponseList =
      issueToken(issueOperation, request, new CustomTokenPrincipal("alice"), msgCtx);

  // Test the generated token.
  Element token = null;
  for (Object tokenObject : securityTokenResponseList.get(0).getAny()) {
    if (tokenObject instanceof JAXBElement<?>
        && REQUESTED_SECURITY_TOKEN.equals(((JAXBElement<?>) tokenObject).getName())) {
      RequestedSecurityTokenType rstType =
          (RequestedSecurityTokenType) ((JAXBElement<?>) tokenObject).getValue();
      token = (Element) rstType.getAny();
      break;
    }
  }
  assertNotNull(token);

  // Validate the token
  JwsJwtCompactConsumer jwtConsumer = new JwsJwtCompactConsumer(token.getTextContent());
  JwtToken jwt = jwtConsumer.getJwtToken();
  // subject unchanged
  Assert.assertEquals("alice", jwt.getClaim(JwtConstants.CLAIM_SUBJECT));
  // transformed claim (to uppercase)
  assertEquals(jwt.getClaim(ClaimTypes.LASTNAME.toString()), "DOE");
}
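In this test the DocumentFragment appears to serve only as a temporary parent for the standalone SAML assertion inside its owner document before the assertion is re-appended to the document and serialized; the fragment itself is never written out. The relevant lines in isolation (a sketch, with "assertion" standing in for the element returned by createSAMLAssertion):

DocumentFragment f = assertion.getOwnerDocument().createDocumentFragment();
f.appendChild(assertion);                             // the assertion now has a parent (the fragment)
assertion.getOwnerDocument().appendChild(assertion);  // appending elsewhere detaches it from the fragment again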
Use of org.w3c.dom.DocumentFragment in project CXF by Apache.
The class StaxUtils, method writeNode.
public static void writeNode(Node n, XMLStreamWriter writer, boolean repairing) throws XMLStreamException {
  switch (n.getNodeType()) {
  case Node.ELEMENT_NODE:
    writeElement((Element) n, writer, repairing);
    break;
  case Node.TEXT_NODE:
    writer.writeCharacters(((Text) n).getNodeValue());
    break;
  case Node.COMMENT_NODE:
    writer.writeComment(((Comment) n).getData());
    break;
  case Node.CDATA_SECTION_NODE:
    writer.writeCData(((CDATASection) n).getData());
    break;
  case Node.ENTITY_REFERENCE_NODE:
    writer.writeEntityRef(((EntityReference) n).getNodeValue());
    break;
  case Node.PROCESSING_INSTRUCTION_NODE:
    ProcessingInstruction pi = (ProcessingInstruction) n;
    writer.writeProcessingInstruction(pi.getTarget(), pi.getData());
    break;
  case Node.DOCUMENT_NODE:
    writeDocument((Document) n, writer, repairing);
    break;
  case Node.DOCUMENT_FRAGMENT_NODE: {
    DocumentFragment frag = (DocumentFragment) n;
    Node child = frag.getFirstChild();
    while (child != null) {
      writeNode(child, writer, repairing);
      child = child.getNextSibling();
    }
    break;
  }
  case Node.DOCUMENT_TYPE_NODE:
    try {
      if (((DocumentType) n).getTextContent() != null) {
        writer.writeDTD(((DocumentType) n).getTextContent());
      }
    } catch (UnsupportedOperationException ex) {
      // can we ignore? DOM writers really don't allow this
      // as there isn't a way to write a DTD in dom
    }
    break;
  default:
    throw new IllegalStateException("Found type: " + n.getClass().getName());
  }
}
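A short usage sketch for the DOCUMENT_FRAGMENT_NODE branch: serializing a fragment emits its children in document order, because the fragment node has no representation of its own in the output. The sketch assumes CXF's StaxUtils.createXMLStreamWriter(OutputStream) and the DOMUtils.getEmptyDocument() helper seen earlier:

static void writeFragment() throws XMLStreamException {
  Document d = DOMUtils.getEmptyDocument();
  DocumentFragment frag = d.createDocumentFragment();
  frag.appendChild(d.createElement("first"));
  frag.appendChild(d.createComment("second"));

  XMLStreamWriter writer = StaxUtils.createXMLStreamWriter(System.out);
  StaxUtils.writeNode(frag, writer, true);  // writes the fragment's two children, not the fragment itself
  writer.flush();
}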