Search in sources :

Example 6 with OutputFormat

use of org.apache.xml.serialize.OutputFormat in project lucene-solr by apache.

the class ExtractingDocumentLoader method load.

/**
 * Parses {@code stream} with a Tika parser and either indexes the extracted document or, in
 * extract-only mode, returns the serialized content and metadata in the response.
 *
 * <p>The parser is looked up by media type when the {@code ExtractingParams.STREAM_TYPE}
 * request parameter is present; otherwise the auto-detecting parser is used.
 *
 * @param req       the request; supplies the stream type, resource name and schema
 * @param rsp       the response; receives the extracted text and a
 *                  {@code <stream name>_metadata} entry in extract-only mode
 * @param stream    the content to extract from
 * @param processor the update processor (not referenced directly in this method)
 * @throws SolrException with {@code BAD_REQUEST} when no parser is available for the stream
 *                       type, or {@code SERVER_ERROR} on a SAX failure or on a Tika failure
 *                       that is not configured to be ignored
 * @throws Exception     any other failure raised while reading or parsing the stream
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                // Extract-only: parse events go to a serializer instead of the Solr handler.
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        try {
                            log.debug("Password file supplied: " + pwMapFile);
                            epp.parse(is);
                        } finally {
                            // This method opened the stream, so it releases it even if parsing the rules fails.
                            IOUtils.closeQuietly(is);
                        }
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    // Best effort: whatever was collected before the failure is still processed below.
                    log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata.toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }
            if (!extractOnly) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                NamedList<String[]> metadataNL = new NamedList<>();
                for (String name : metadata.names()) {
                    metadataNL.add(name, metadata.getValues(name));
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 7 with OutputFormat

use of org.apache.xml.serialize.OutputFormat in project sirix by sirixdb.

the class ExtractArticles method main.

/**
 * Main method. Parses the given Wikipedia dump and serializes the SAX events to {@code TARGET}
 * as XML.
 *
 * @param pArgs
 *          First param specifies the Wikipedia dump to parse.
 * @throws IllegalStateException
 *           if not exactly one argument is supplied
 */
public static void main(final String[] pArgs) {
    if (pArgs.length != 1) {
        // The exception used to be constructed without being thrown, so the check had no effect.
        throw new IllegalStateException("First parameter must be the wikipedia dump!");
    }
    start = System.nanoTime();
    System.out.print("Start extracting articles... ");
    final String wikiDump = new File(pArgs[0]).getAbsolutePath();
    final XMLReader parser = new ExtractArticles(new SAXParser());
    try {
        TARGET.delete();
        TARGET.createNewFile();
        // Close the writer when parsing ends (normally or not) so buffered output reaches the file.
        try (FileWriter out = new FileWriter(TARGET)) {
            final XMLSerializer printer = new XMLSerializer(out, new OutputFormat());
            parser.setContentHandler(printer);
            parser.parse(wikiDump);
        }
    } catch (final IOException | SAXException e) {
        LOGWRAPPER.error(e.getMessage(), e);
    }
}
Also used : XMLSerializer(org.apache.xml.serialize.XMLSerializer) FileWriter(java.io.FileWriter) OutputFormat(org.apache.xml.serialize.OutputFormat) SAXParser(org.apache.xerces.parsers.SAXParser) IOException(java.io.IOException) File(java.io.File) XMLReader(org.xml.sax.XMLReader) SAXException(org.xml.sax.SAXException)

Example 8 with OutputFormat

use of org.apache.xml.serialize.OutputFormat in project airavata by apache.

the class XmlFormatter method format.

/**
 * Pretty-prints an XML string with a 2-space indent and a line width of 65.
 *
 * @param unformattedXml the XML text to format; it is handed to {@code parseXmlFile}
 * @return formattedXml the indented, line-wrapped serialization of the parsed document
 * @throws RuntimeException wrapping any {@link IOException} raised while parsing or serializing
 */
public static String format(String unformattedXml) {
    try {
        final Document document = parseXmlFile(unformattedXml);
        OutputFormat format = new OutputFormat(document);
        // setIndenting(true) resets indent and line width to the library defaults, so it
        // must run before the explicit values or the requested line width is discarded.
        format.setIndenting(true);
        format.setIndent(2);
        format.setLineWidth(65);
        Writer out = new StringWriter();
        XMLSerializer serializer = new XMLSerializer(out, format);
        serializer.serialize(document);
        return out.toString();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : XMLSerializer(org.apache.xml.serialize.XMLSerializer) StringWriter(java.io.StringWriter) OutputFormat(org.apache.xml.serialize.OutputFormat) IOException(java.io.IOException) Document(org.w3c.dom.Document) StringWriter(java.io.StringWriter) Writer(java.io.Writer)

Example 9 with OutputFormat

use of org.apache.xml.serialize.OutputFormat in project ats-framework by Axway.

the class AtsProjectConfiguration method save.

/**
 * Writes the configuration document {@code doc} to {@code atsConfigurationFile} as indented XML
 * (4-space indent, line width 1000).
 *
 * @throws AtsConfigurationException if building the output format, opening the file, writing
 *                                   the document or closing the file fails
 */
public void save() throws AtsConfigurationException {
    // save the XML file
    try {
        OutputFormat format = new OutputFormat(doc);
        format.setIndenting(true);
        format.setIndent(4);
        format.setLineWidth(1000);
        // try-with-resources closes the file handle on success and on failure; a failure
        // during close is also reported through the catch below.
        try (FileOutputStream out = new FileOutputStream(new File(atsConfigurationFile))) {
            XMLSerializer serializer = new XMLSerializer(out, format);
            serializer.serialize(doc);
        }
    } catch (Exception e) {
        throw new AtsConfigurationException("Error saving ATS configuration in '" + atsConfigurationFile + "'", e);
    }
}
Also used : XMLSerializer(org.apache.xml.serialize.XMLSerializer) AtsConfigurationException(com.axway.ats.core.atsconfig.exceptions.AtsConfigurationException) FileOutputStream(java.io.FileOutputStream) OutputFormat(org.apache.xml.serialize.OutputFormat) File(java.io.File) AtsConfigurationException(com.axway.ats.core.atsconfig.exceptions.AtsConfigurationException)

Example 10 with OutputFormat

use of org.apache.xml.serialize.OutputFormat in project ats-framework by Axway.

the class LocalFileSystemSnapshot method toFile.

/**
 * Serializes this file system snapshot to {@code backupFile} as an indented XML document.
 *
 * <p>The root element carries the snapshot's name and timestamp; every entry of
 * {@code dirSnapshots} contributes one child element tagged with its alias, and that entry
 * fills in the element's content itself.
 *
 * @param backupFile path of the file to write; its directory is created when absent
 * @throws FileSystemSnapshotException if the DOM document cannot be created or the file
 *                                     cannot be saved
 */
@Override
public void toFile(String backupFile) {
    log.info("SAVE TO FILE " + backupFile + " - START");

    // the target directory has to exist before the file is opened for writing
    File parentDir = new File(IoUtils.getFilePath(backupFile));
    if (!parentDir.exists()) {
        parentDir.mkdirs();
    }

    Document xml;
    try {
        xml = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    } catch (Exception e) {
        throw new FileSystemSnapshotException("Error creating DOM parser for " + backupFile, e);
    }

    // TODO - add DTD or schema for manual creation and easy validation
    Element root = xml.createElement(NODE_FILE_SYSTEM);
    root.setAttribute("name", this.name);
    root.setAttribute("time", SnapshotUtils.dateToString(this.snapshotTimestamp));
    xml.appendChild(root);

    // one child element per directory snapshot, identified by its alias
    for (String alias : this.dirSnapshots.keySet()) {
        Element dirElement = xml.createElement(NODE_DIRECTORY);
        dirElement.setAttribute("alias", alias);
        root.appendChild(dirElement);
        this.dirSnapshots.get(alias).toFile(xml, dirElement);
    }

    // write the assembled document out
    OutputStream target = null;
    try {
        OutputFormat layout = new OutputFormat(xml);
        layout.setIndenting(true);
        layout.setIndent(4);
        layout.setLineWidth(1000);
        target = new FileOutputStream(new File(backupFile));
        new XMLSerializer(target, layout).serialize(xml);
    } catch (Exception e) {
        throw new FileSystemSnapshotException("Error saving " + backupFile, e);
    } finally {
        IoUtils.closeStream(target, "Error closing IO stream to file used for file system snapshot backup " + backupFile);
    }

    log.info("SAVE TO FILE " + backupFile + " - END");
}
Also used : XMLSerializer(org.apache.xml.serialize.XMLSerializer) Element(org.w3c.dom.Element) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FileOutputStream(java.io.FileOutputStream) OutputFormat(org.apache.xml.serialize.OutputFormat) Document(org.w3c.dom.Document) FileSystemSnapshotException(com.axway.ats.common.filesystem.snapshot.FileSystemSnapshotException) File(java.io.File) FileSystemSnapshotException(com.axway.ats.common.filesystem.snapshot.FileSystemSnapshotException)

Aggregations

OutputFormat (org.apache.xml.serialize.OutputFormat)19 XMLSerializer (org.apache.xml.serialize.XMLSerializer)19 FileOutputStream (java.io.FileOutputStream)5 Document (org.w3c.dom.Document)5 File (java.io.File)4 IOException (java.io.IOException)4 OutputStream (java.io.OutputStream)3 StringWriter (java.io.StringWriter)2 DocumentBuilder (javax.xml.parsers.DocumentBuilder)2 DOMSerializer (org.apache.xml.serialize.DOMSerializer)2 Element (org.w3c.dom.Element)2 SAXException (org.xml.sax.SAXException)2 SkipColumns (com.axway.ats.action.dbaccess.snapshot.rules.SkipColumns)1 SkipContent (com.axway.ats.action.dbaccess.snapshot.rules.SkipContent)1 SkipIndexAttributes (com.axway.ats.action.dbaccess.snapshot.rules.SkipIndexAttributes)1 SkipRows (com.axway.ats.action.dbaccess.snapshot.rules.SkipRows)1 DatabaseSnapshotException (com.axway.ats.common.dbaccess.snapshot.DatabaseSnapshotException)1 TableDescription (com.axway.ats.common.dbaccess.snapshot.TableDescription)1 FileSystemSnapshotException (com.axway.ats.common.filesystem.snapshot.FileSystemSnapshotException)1 AtsConfigurationException (com.axway.ats.core.atsconfig.exceptions.AtsConfigurationException)1