Search in sources :

Example 1 with Syntax

Use of org.ontoware.rdf2go.model.Syntax in the Apache Stanbol project.

Class HtmlExtractionRegistry, method initialize.

/**
 * Parses the extractor configuration XML and registers every extractor it declares.
 *
 * <p>Each {@code /htmlextractors/extractor} element must carry an {@code id} attribute. Its
 * {@code source} child selects the implementation through the {@code type} attribute:
 * <ul>
 * <li>{@code xslt}: the text content of {@code source} is handed to an {@link XsltExtractor}
 * as its file name. The optional {@code syntax} attribute ({@code rdfxml}, {@code turtle},
 * {@code ntriple} or {@code n3}, case-insensitive, default {@code rdfxml}) selects the RDF
 * syntax, and the optional {@code uri} attribute sets the extractor's URI parameter.</li>
 * <li>{@code java}: the text content of {@code source} is a class name that is instantiated
 * through its no-argument constructor and must be an {@code HtmlExtractionComponent}.</li>
 * </ul>
 * Extractor elements without a {@code source} child are skipped; a missing or unknown
 * {@code type} is only logged as a warning. Registered ids are also added to the active
 * extractors.
 *
 * <p>This method does not explicitly close {@code configFileStream}.
 *
 * @param configFileStream stream providing the XML configuration
 * @throws InitializationException if the configuration cannot be read or parsed, an extractor
 *         has no {@code id}, an unknown RDF syntax is named, or a {@code java} extractor
 *         class is missing, cannot be loaded or instantiated, or has the wrong type
 */
public void initialize(InputStream configFileStream) throws InitializationException {
    try {
        XPathFactory factory = XPathFactory.newInstance();
        XPath xPath = factory.newXPath();
        DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document document = parser.parse(new InputSource(configFileStream));
        Node node;
        NodeList nodes = (NodeList) xPath.evaluate("/htmlextractors/extractor", document, XPathConstants.NODESET);
        if (nodes != null) {
            TransformerFactory transFac = TransformerFactory.newInstance();
            transFac.setURIResolver(new BundleURIResolver());
            for (int j = 0, iCnt = nodes.getLength(); j < iCnt; j++) {
                Node nd = nodes.item(j);
                node = (Node) xPath.evaluate("@id", nd, XPathConstants.NODE);
                if (node == null) {
                    // Report a configuration error instead of failing with a NullPointerException.
                    throw new InitializationException("Missing id attribute on extractor element " + (j + 1));
                }
                String id = node.getNodeValue();
                Node srcNode = (Node) xPath.evaluate("source", nd, XPathConstants.NODE);
                if (srcNode != null) {
                    node = (Node) xPath.evaluate("@type", srcNode, XPathConstants.NODE);
                    // A missing type attribute is treated like an unknown type (warning below).
                    String srcType = (node != null) ? node.getNodeValue() : null;
                    if ("xslt".equals(srcType)) {
                        Syntax rdfSyntax = Syntax.RdfXml;
                        node = (Node) xPath.evaluate("@syntax", srcNode, XPathConstants.NODE);
                        if (node != null) {
                            String rdfFormat = node.getNodeValue();
                            if (rdfFormat.equalsIgnoreCase("turtle")) {
                                rdfSyntax = Syntax.Turtle;
                            } else if (rdfFormat.equalsIgnoreCase("ntriple")) {
                                rdfSyntax = Syntax.Ntriples;
                            } else if (rdfFormat.equalsIgnoreCase("n3")) {
                                rdfSyntax = XsltExtractor.N3;
                            } else if (!rdfFormat.equalsIgnoreCase("rdfxml")) {
                                throw new InitializationException("Unknown RDF Syntax: " + rdfFormat + " for " + id + " extractor");
                            }
                        }
                        // TODO: do something about disjunctions of
                        // Extractors? Assume, only RDFa or Microformats are
                        // used?
                        String fileName = DOMUtils.getText(srcNode);
                        XsltExtractor xsltExtractor = new XsltExtractor(id, fileName, transFac);
                        xsltExtractor.setSyntax(rdfSyntax);
                        // name of URI/URL parameter of the script (default
                        // "uri")
                        node = (Node) xPath.evaluate("@uri", srcNode, XPathConstants.NODE);
                        if (node != null) {
                            xsltExtractor.setUriParameter(node.getNodeValue());
                        }
                        registry.put(id, xsltExtractor);
                        activeExtractors.add(id);
                    } else if ("java".equals(srcType)) {
                        // getNodeValue() is null for element nodes, so the class name has to be
                        // read from the element's text content, as the xslt branch does for its
                        // file name.
                        String clsName = DOMUtils.getText(srcNode);
                        if (clsName != null) {
                            clsName = clsName.trim();
                        }
                        if (clsName == null || clsName.length() == 0) {
                            throw new InitializationException("No class name given for " + id + " extractor");
                        }
                        Object extractor = Class.forName(clsName).newInstance();
                        if (extractor instanceof HtmlExtractionComponent) {
                            registry.put(id, (HtmlExtractionComponent) extractor);
                            activeExtractors.add(id);
                        } else {
                            throw new InitializationException(clsName + " is not an HtmlExtractionComponent");
                        }
                    } else {
                        LOG.warn("No valid type for extractor found: " + id);
                    }
                    LOG.info("Extractor for: " + id);
                }
            }
        }
    } catch (FileNotFoundException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (XPathExpressionException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (DOMException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (ParserConfigurationException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (SAXException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (IOException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (ClassNotFoundException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (InstantiationException e) {
        throw new InitializationException(e.getMessage(), e);
    } catch (IllegalAccessException e) {
        throw new InitializationException(e.getMessage(), e);
    }
}
Also used : InputSource(org.xml.sax.InputSource) XPathExpressionException(javax.xml.xpath.XPathExpressionException) Node(org.w3c.dom.Node) FileNotFoundException(java.io.FileNotFoundException) Document(org.w3c.dom.Document) SAXException(org.xml.sax.SAXException) XPathFactory(javax.xml.xpath.XPathFactory) DOMException(org.w3c.dom.DOMException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) XPath(javax.xml.xpath.XPath) TransformerFactory(javax.xml.transform.TransformerFactory) NodeList(org.w3c.dom.NodeList) IOException(java.io.IOException) DocumentBuilder(javax.xml.parsers.DocumentBuilder) Syntax(org.ontoware.rdf2go.model.Syntax)

Aggregations

FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 DocumentBuilder (javax.xml.parsers.DocumentBuilder)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 TransformerFactory (javax.xml.transform.TransformerFactory)1 XPath (javax.xml.xpath.XPath)1 XPathExpressionException (javax.xml.xpath.XPathExpressionException)1 XPathFactory (javax.xml.xpath.XPathFactory)1 Syntax (org.ontoware.rdf2go.model.Syntax)1 DOMException (org.w3c.dom.DOMException)1 Document (org.w3c.dom.Document)1 Node (org.w3c.dom.Node)1 NodeList (org.w3c.dom.NodeList)1 InputSource (org.xml.sax.InputSource)1 SAXException (org.xml.sax.SAXException)1