Usage of org.ontoware.rdf2go.model.Syntax in the Apache Stanbol project.
From the class HtmlExtractionRegistry, the method initialize.
/**
 * Reads the extractor configuration from the given XML stream and registers
 * one extraction component per {@code /htmlextractors/extractor} element.
 *
 * <p>Each extractor element must carry an {@code id} attribute and may contain
 * a {@code source} child element. The {@code type} attribute of that child
 * selects how the component is created:
 * <ul>
 *   <li>{@code xslt} - the element text is handed to an {@link XsltExtractor};
 *       the optional {@code syntax} attribute ({@code rdfxml}, {@code turtle},
 *       {@code ntriple} or {@code n3}, case-insensitive) selects the RDF syntax
 *       and the optional {@code uri} attribute sets the URI parameter name.</li>
 *   <li>{@code java} - the element text is taken as a class name that is
 *       loaded and instantiated reflectively; it must be an
 *       {@link HtmlExtractionComponent}.</li>
 * </ul>
 * Any other (or missing) type is logged as a warning and skipped. Extractor
 * elements without a {@code source} child are ignored.
 *
 * @param configFileStream stream delivering the XML configuration; it is not
 *        closed by this method
 * @throws InitializationException if the configuration cannot be read or
 *         parsed, an extractor has no {@code id}, an unknown RDF syntax is
 *         named, or a configured class cannot be loaded, instantiated or is
 *         not an {@link HtmlExtractionComponent}
 */
public void initialize(InputStream configFileStream) throws InitializationException {
    try {
        XPathFactory factory = XPathFactory.newInstance();
        XPath xPath = factory.newXPath();
        DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document document = parser.parse(new InputSource(configFileStream));
        NodeList nodes = (NodeList) xPath.evaluate("/htmlextractors/extractor", document, XPathConstants.NODESET);
        if (nodes != null) {
            TransformerFactory transFac = TransformerFactory.newInstance();
            transFac.setURIResolver(new BundleURIResolver());
            for (int j = 0, iCnt = nodes.getLength(); j < iCnt; j++) {
                Node nd = nodes.item(j);
                Node idNode = (Node) xPath.evaluate("@id", nd, XPathConstants.NODE);
                if (idNode == null) {
                    // Without an id the component cannot be registered; report it
                    // as a configuration error instead of failing with an NPE.
                    throw new InitializationException("Missing id attribute on extractor element at position " + (j + 1));
                }
                String id = idNode.getNodeValue();
                Node srcNode = (Node) xPath.evaluate("source", nd, XPathConstants.NODE);
                if (srcNode == null) {
                    continue;
                }
                Node typeNode = (Node) xPath.evaluate("@type", srcNode, XPathConstants.NODE);
                // A missing type attribute is treated like an unknown type below.
                String srcType = typeNode == null ? null : typeNode.getNodeValue();
                if ("xslt".equals(srcType)) {
                    Syntax rdfSyntax = Syntax.RdfXml;
                    Node syntaxNode = (Node) xPath.evaluate("@syntax", srcNode, XPathConstants.NODE);
                    if (syntaxNode != null) {
                        String rdfFormat = syntaxNode.getNodeValue();
                        if (rdfFormat.equalsIgnoreCase("turtle")) {
                            rdfSyntax = Syntax.Turtle;
                        } else if (rdfFormat.equalsIgnoreCase("ntriple")) {
                            rdfSyntax = Syntax.Ntriples;
                        } else if (rdfFormat.equalsIgnoreCase("n3")) {
                            rdfSyntax = XsltExtractor.N3;
                        } else if (!rdfFormat.equalsIgnoreCase("rdfxml")) {
                            throw new InitializationException("Unknown RDF Syntax: " + rdfFormat + " for " + id + " extractor");
                        }
                    }
                    // TODO: do something about disjunctions of
                    // Extractors? Assume, only RDFa or Microformats are
                    // used?
                    String fileName = DOMUtils.getText(srcNode);
                    XsltExtractor xsltExtractor = new XsltExtractor(id, fileName, transFac);
                    xsltExtractor.setSyntax(rdfSyntax);
                    // name of URI/URL parameter of the script (default
                    // "uri")
                    Node uriNode = (Node) xPath.evaluate("@uri", srcNode, XPathConstants.NODE);
                    if (uriNode != null) {
                        xsltExtractor.setUriParameter(uriNode.getNodeValue());
                    }
                    registry.put(id, xsltExtractor);
                    activeExtractors.add(id);
                } else if ("java".equals(srcType)) {
                    // srcNode is an element, so getNodeValue() would be null;
                    // read the element text the same way the xslt branch does.
                    String clsName = DOMUtils.getText(srcNode);
                    if (clsName == null || clsName.trim().isEmpty()) {
                        throw new InitializationException("No class name given for " + id + " extractor");
                    }
                    clsName = clsName.trim();
                    Object extractor = Class.forName(clsName).newInstance();
                    if (extractor instanceof HtmlExtractionComponent) {
                        registry.put(id, (HtmlExtractionComponent) extractor);
                        activeExtractors.add(id);
                    } else {
                        throw new InitializationException(clsName + " is not an HtmlExtractionComponent");
                    }
                } else {
                    LOG.warn("No valid type for extractor found: " + id);
                }
                LOG.info("Extractor for: " + id);
            }
        }
    } catch (FileNotFoundException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (XPathExpressionException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (DOMException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (ParserConfigurationException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (SAXException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (IOException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (ClassNotFoundException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (InstantiationException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (IllegalAccessException e) {
        throw new InitializationException(e.getMessage(), e);
    }
}
Aggregations