use of org.apache.xml.serialize.OutputFormat in project lucene-solr by apache.
the class ExtractingDocumentLoader method load.
/**
 * Parses the given content stream with a Tika parser and either indexes the result or
 * returns the extracted content in the response.
 *
 * The parser is chosen from the {@code ExtractingParams.STREAM_TYPE} request parameter when
 * present, otherwise the auto-detecting parser is used. When
 * {@code ExtractingParams.EXTRACT_ONLY} is true the parsed content is serialized (text or XML)
 * and added to {@code rsp} together with the collected metadata; otherwise the document built
 * by the SolrContentHandler is passed to {@code addDoc}.
 *
 * @param req       the request; its params, schema and core resource loader are read here
 * @param rsp       the response; only written to in extract-only mode
 * @param stream    the content to parse; its stream is closed quietly before returning
 * @param processor not referenced in this method body
 * @throws SolrException BAD_REQUEST when no parser matches the requested stream type;
 *         SERVER_ERROR wrapping a SAXException, or a TikaException when
 *         {@code ignoreTikaException} is false
 * @throws Exception any other failure raised by the stream, the parser or {@code addDoc}
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
Parser parser = null;
String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
if (streamType != null) {
//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
// Explicit stream type: look up the parser registered for that (trimmed, lower-cased) media type.
// The lookup yields null for an unknown type, which is reported by the else-branch at the bottom.
MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
} else {
parser = autoDetectParser;
}
if (parser != null) {
Metadata metadata = new Metadata();
// If you specify the resource name (the filename, roughly) with this parameter,
// then Tika can make use of it in guessing the appropriate MIME type:
String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
if (resourceName != null) {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
}
// Provide stream's content type as hint for auto detection
if (stream.getContentType() != null) {
metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
}
InputStream inputStream = null;
try {
inputStream = stream.getStream();
// Record where the content came from alongside whatever the parser extracts.
metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
// HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
if (charset != null) {
metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
}
// NOTE(review): 'params' is declared outside this method; presumably it holds the
// request parameters - confirm against the enclosing class.
String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
// parsingHandler is what the parser actually writes to; it stays equal to 'handler'
// unless extract-only mode or an XPath filter replaces/wraps it below.
ContentHandler parsingHandler = handler;
StringWriter writer = null;
BaseMarkupSerializer serializer = null;
if (extractOnly == true) {
// Extract-only: serialize the parser's SAX events into 'writer' instead of building a document.
String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
writer = new StringWriter();
if (extractFormat.equals(TEXT_FORMAT)) {
serializer = new TextSerializer();
serializer.setOutputCharStream(writer);
serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
} else {
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
}
if (xpathExpr != null) {
Matcher matcher = PARSER.parse(xpathExpr);
//The MatchingContentHandler does not invoke startDocument. See http://tika.markmail.org/message/kknu3hw7argwiqin
// ...so the document is started here and ended explicitly after parsing (see below).
serializer.startDocument();
parsingHandler = new MatchingContentHandler(serializer, matcher);
} else {
parsingHandler = serializer;
}
} else if (xpathExpr != null) {
// Indexing mode with an XPath filter: wrap the Solr handler in a matching handler.
Matcher matcher = PARSER.parse(xpathExpr);
parsingHandler = new MatchingContentHandler(handler, matcher);
}
try {
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = parseContextConfig.create();
context.set(Parser.class, parser);
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
// Password handling
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
if (pwMapFile != null && pwMapFile.length() > 0) {
// NOTE(review): 'is' is not closed in this method - confirm that epp.parse closes it.
InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
if (is != null) {
log.debug("Password file supplied: " + pwMapFile);
epp.parse(is);
}
}
context.set(PasswordProvider.class, epp);
// An explicit password parameter is set on the same provider after the map file is loaded.
String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
if (resourcePassword != null) {
epp.setExplicitPassword(resourcePassword);
log.debug("Literal password supplied for file " + resourceName);
}
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
// When ignoreTikaException is set the failure is only logged and execution continues
// below with whatever the handler/serializer has collected so far.
if (ignoreTikaException)
log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
else
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
if (extractOnly == false) {
addDoc(handler);
} else {
//serializer is not null, so we need to call endDoc on it if using xpath
if (xpathExpr != null) {
serializer.endDocument();
}
// The serialized content is returned under the stream's name...
rsp.add(stream.getName(), writer.toString());
writer.close();
// ...and every metadata name with all of its values under "<stream name>_metadata".
String[] names = metadata.names();
NamedList metadataNL = new NamedList();
for (int i = 0; i < names.length; i++) {
String[] vals = metadata.getValues(names[i]);
metadataNL.add(names[i], vals);
}
rsp.add(stream.getName() + "_metadata", metadataNL);
}
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
// Always release the content stream, whether parsing succeeded or not.
IOUtils.closeQuietly(inputStream);
}
} else {
// No parser: reached when the explicit stream type had no registered parser (or autoDetectParser is null).
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
}
}
use of org.apache.xml.serialize.OutputFormat in project sirix by sirixdb.
the class ExtractArticles method main.
/**
 * Main method. Parses the given Wikipedia dump with an {@code ExtractArticles} filter and
 * serializes the resulting SAX events into {@code TARGET}, which is deleted and recreated
 * first.
 *
 * I/O and SAX failures are logged through {@code LOGWRAPPER} rather than propagated.
 *
 * @param pArgs
 *          First param specifies the Wikipedia dump to parse.
 * @throws IllegalStateException
 *           if not exactly one argument is supplied
 */
public static void main(final String[] pArgs) {
  if (pArgs.length != 1) {
    // The exception used to be constructed but never thrown, so a wrong argument count
    // was silently ignored (or surfaced later as an ArrayIndexOutOfBoundsException).
    throw new IllegalStateException("First parameter must be the wikipedia dump!");
  }
  start = System.nanoTime();
  System.out.print("Start extracting articles... ");
  final String wikiDump = new File(pArgs[0]).getAbsolutePath();
  // 'new' never yields null, so no null check is needed on the reader.
  final XMLReader parser = new ExtractArticles(new SAXParser());
  try {
    TARGET.delete();
    TARGET.createNewFile();
    // try-with-resources guarantees the writer is flushed and closed even when parsing
    // fails; previously it was never closed and buffered output could be lost.
    try (FileWriter out = new FileWriter(TARGET)) {
      final XMLSerializer printer = new XMLSerializer(out, new OutputFormat());
      parser.setContentHandler(printer);
      parser.parse(wikiDump);
    }
  } catch (final IOException | SAXException e) {
    LOGWRAPPER.error(e.getMessage(), e);
  }
}
use of org.apache.xml.serialize.OutputFormat in project airavata by apache.
the class XmlFormatter method format.
/**
 * Pretty-prints an XML string: the text is parsed with {@code parseXmlFile} and the
 * resulting document is re-serialized with an indent of 2 and a line width of 65.
 *
 * @param unformattedXml
 *          the XML text to format
 * @return formattedXml the indented serialization of the parsed document
 * @throws RuntimeException
 *           wrapping any {@link IOException} raised while serializing
 */
public static String format(String unformattedXml) {
  try {
    final Document document = parseXmlFile(unformattedXml);
    OutputFormat format = new OutputFormat(document);
    // setIndenting(true) resets both the indent and the line width to the serializer
    // defaults, so it has to be called BEFORE the explicit values are set; otherwise
    // the requested line width of 65 is overwritten.
    format.setIndenting(true);
    format.setIndent(2);
    format.setLineWidth(65);
    Writer out = new StringWriter();
    XMLSerializer serializer = new XMLSerializer(out, format);
    serializer.serialize(document);
    return out.toString();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
use of org.apache.xml.serialize.OutputFormat in project ats-framework by Axway.
the class AtsProjectConfiguration method save.
/**
 * Writes the configuration document {@code doc} to {@code atsConfigurationFile} as XML,
 * indented by 4 with a line width of 1000.
 *
 * @throws AtsConfigurationException
 *           if the file cannot be opened, written or closed
 */
public void save() throws AtsConfigurationException {
  // save the XML file
  try {
    OutputFormat format = new OutputFormat(doc);
    format.setIndenting(true);
    format.setIndent(4);
    format.setLineWidth(1000);
    // try-with-resources closes the file on success and on failure; the stream used to
    // be left open, leaking a file handle on every save.
    try (FileOutputStream out = new FileOutputStream(new File(atsConfigurationFile))) {
      XMLSerializer serializer = new XMLSerializer(out, format);
      serializer.serialize(doc);
    }
  } catch (Exception e) {
    throw new AtsConfigurationException("Error saving ATS configuration in '" + atsConfigurationFile + "'", e);
  }
}
use of org.apache.xml.serialize.OutputFormat in project ats-framework by Axway.
the class LocalFileSystemSnapshot method toFile.
/**
 * Saves this snapshot as an XML file.
 *
 * A new DOM document is built with one root element carrying the snapshot name and
 * timestamp, plus one child element per directory snapshot (each directory snapshot fills
 * in its own element). The document is then written to {@code backupFile}, indented by 4
 * with a line width of 1000. The parent directory is created when it does not exist.
 *
 * @param backupFile path of the file to write
 * @throws FileSystemSnapshotException if the DOM document cannot be created or the file
 *         cannot be written
 */
@Override
public void toFile(String backupFile) {
  log.info("SAVE TO FILE " + backupFile + " - START");

  // make sure the target directory is there before anything is written
  File parentDir = new File(IoUtils.getFilePath(backupFile));
  if (!parentDir.exists()) {
    parentDir.mkdirs();
  }

  Document xmlDoc;
  try {
    xmlDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
  } catch (Exception e) {
    throw new FileSystemSnapshotException("Error creating DOM parser for " + backupFile, e);
  }

  // TODO - add DTD or schema for manual creation and easy validation
  Element rootNode = xmlDoc.createElement(NODE_FILE_SYSTEM);
  rootNode.setAttribute("name", name);
  rootNode.setAttribute("time", SnapshotUtils.dateToString(snapshotTimestamp));
  xmlDoc.appendChild(rootNode);

  // one child element per directory snapshot, identified by its alias
  for (String alias : dirSnapshots.keySet()) {
    Element dirNode = xmlDoc.createElement(NODE_DIRECTORY);
    rootNode.appendChild(dirNode);
    dirNode.setAttribute("alias", alias);
    dirSnapshots.get(alias).toFile(xmlDoc, dirNode);
  }

  // write the document out; the stream is closed in 'finally' whatever happens
  OutputStream fileOut = null;
  try {
    OutputFormat xmlFormat = new OutputFormat(xmlDoc);
    xmlFormat.setIndenting(true);
    xmlFormat.setIndent(4);
    xmlFormat.setLineWidth(1000);

    fileOut = new FileOutputStream(new File(backupFile));
    new XMLSerializer(fileOut, xmlFormat).serialize(xmlDoc);
  } catch (Exception e) {
    throw new FileSystemSnapshotException("Error saving " + backupFile, e);
  } finally {
    IoUtils.closeStream(fileOut, "Error closing IO stream to file used for file system snapshot backup " + backupFile);
  }

  log.info("SAVE TO FILE " + backupFile + " - END");
}
Aggregations