Use of org.apache.tika.exception.TikaException in project che by eclipse.
The class MediaTypeFilter, method accept.
@Override
public boolean accept(VirtualFile file) {
    try (InputStream content = file.getContent()) {
        TikaConfig tikaConfig = new TikaConfig();
        MediaType mimeType = tikaConfig.getDetector().detect(content, new Metadata());
        return excludedMediaTypes.contains(mimeType) || excludedTypes.contains(mimeType.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // If the content cannot be read or its type detected, treat the file as filtered out
        return true;
    }
}
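For comparison, Tika's detection API can be exercised outside Che's VirtualFile abstraction. Below is a minimal standalone sketch of the same detect() call; the exclusion set and the command-line file path are illustrative, not part of the original filter:

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Set;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class DetectSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical exclusion set standing in for excludedMediaTypes above
        Set<MediaType> excluded = Collections.singleton(MediaType.application("zip"));
        TikaConfig tikaConfig = new TikaConfig();
        // Wrap in a BufferedInputStream so the detector can mark/reset the stream
        try (InputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(args[0])))) {
            MediaType type = tikaConfig.getDetector().detect(in, new Metadata());
            System.out.println(type + " excluded=" + excluded.contains(type));
        }
    }
}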
Use of org.apache.tika.exception.TikaException in project lucene-solr by apache.
The class ExtractingDocumentLoader, method load.
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        // Cache? Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide the stream's content type as a hint for auto-detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser honor Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    // The MatchingContentHandler does not invoke startDocument. See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                // Potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata);
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }
            if (!extractOnly) {
                addDoc(handler);
            } else {
                // serializer is not null, so we need to call endDocument on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
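Stripped of the Solr plumbing, the core of this loader is the standard Tika parse call: pick a Parser, seed a Metadata object with hints, and stream SAX events into a ContentHandler. A minimal sketch of that core flow, using Tika's stock AutoDetectParser and BodyContentHandler rather than the Solr-specific handlers above (the file path is illustrative):

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class ExtractSketch {
    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // File-name hint, analogous to the RESOURCE_NAME_KEY hint above
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, args[0]);
        BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
        try (InputStream in = Files.newInputStream(Paths.get(args[0]))) {
            parser.parse(in, handler, metadata, new ParseContext());
        }
        System.out.println(handler); // extracted plain text
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}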
Use of org.apache.tika.exception.TikaException in project tika by apache.
The class LanguageProfilerBuilder, method create.
/**
 * Creates a new language profile from a text file (preferably a quite
 * large one, 5-10k lines).
 *
 * @param name name to be given to the profile
 * @param is stream to be read
 * @param encoding encoding of the stream
 *
 * @throws TikaException if the language profile could not be created
 */
public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
    LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
    BufferedInputStream bis = new BufferedInputStream(is);
    byte[] buffer = new byte[4096];
    StringBuilder text = new StringBuilder();
    int len;
    try {
        // Note: decoding each 4 KB chunk separately can split multibyte
        // sequences at buffer boundaries
        while ((len = bis.read(buffer)) != -1) {
            text.append(new String(buffer, 0, len, encoding));
        }
    } catch (IOException e) {
        throw new TikaException("Could not create profile, " + e.getMessage());
    }
    newProfile.analyze(text);
    return newProfile;
}
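A usage sketch for this factory, building a profile from a corpus file and persisting it with the save(OutputStream) method exercised by the main driver below. The corpus path, profile name, and the ".ngp" extension (assumed to be the value of FILE_EXTENSION) are illustrative, as is the Tika 1.x package in the import:

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.tika.language.LanguageProfilerBuilder; // assumed Tika 1.x location

public class ProfileSketch {
    public static void main(String[] args) throws Exception {
        try (InputStream in = new FileInputStream("corpus_en.txt");
             OutputStream out = new FileOutputStream("en.ngp")) { // ".ngp" assumed as FILE_EXTENSION
            LanguageProfilerBuilder profile = LanguageProfilerBuilder.create("en", in, "UTF-8");
            profile.save(out); // persist the n-gram profile for later load()/getSimilarity()
        }
    }
}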
Use of org.apache.tika.exception.TikaException in project tika by apache.
The class LanguageProfilerBuilder, method main.
/**
 * Main method, used for testing only.
 *
 * @param args command-line flags as described in the usage string
 */
public static void main(String[] args) {
    // e.g. -create he sample_he.txt utf-8
    String usage = "Usage: NGramProfile " + "[-create profilename filename encoding] " + "[-similarity file1 file2 encoding] " + "[-score profile-name filename encoding]";
    int command = 0;
    final int CREATE = 1;
    final int SIMILARITY = 2;
    final int SCORE = 3;
    String profilename = "";
    String filename = "";
    String filename2 = "";
    String encoding = "";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        // parse command line
        if (args[i].equals("-create")) {
            // found -create option
            command = CREATE;
            profilename = args[++i];
            filename = args[++i];
            encoding = args[++i];
        }
        if (args[i].equals("-similarity")) {
            // found -similarity option
            command = SIMILARITY;
            filename = args[++i];
            filename2 = args[++i];
            encoding = args[++i];
        }
        if (args[i].equals("-score")) {
            // found -score option
            command = SCORE;
            profilename = args[++i];
            filename = args[++i];
            encoding = args[++i];
        }
    }
    try {
        switch (command) {
            case CREATE:
                File f = new File(filename);
                FileInputStream fis = new FileInputStream(f);
                LanguageProfilerBuilder newProfile = LanguageProfilerBuilder.create(profilename, fis, encoding);
                fis.close();
                f = new File(profilename + "." + FILE_EXTENSION);
                FileOutputStream fos = new FileOutputStream(f);
                newProfile.save(fos);
                System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
                break;
            case SIMILARITY:
                f = new File(filename);
                fis = new FileInputStream(f);
                newProfile = LanguageProfilerBuilder.create(filename, fis, encoding);
                newProfile.normalize();
                f = new File(filename2);
                fis = new FileInputStream(f);
                LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder.create(filename2, fis, encoding);
                newProfile2.normalize();
                System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2));
                break;
            case SCORE:
                f = new File(filename);
                fis = new FileInputStream(f);
                newProfile = LanguageProfilerBuilder.create(filename, fis, encoding);
                f = new File(profilename + "." + FILE_EXTENSION);
                fis = new FileInputStream(f);
                LanguageProfilerBuilder compare = new LanguageProfilerBuilder(profilename, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
                compare.load(fis);
                System.out.println("Score is " + compare.getSimilarity(newProfile));
                break;
        }
    } catch (Exception e) {
        e.printStackTrace();
        // throw new TikaException("");
    }
}
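For reference, the three modes map to invocations like the following. File and profile names are illustrative, and the fully qualified class name assumes the Tika 1.x package org.apache.tika.language:

    java org.apache.tika.language.LanguageProfilerBuilder -create en corpus_en.txt UTF-8
    java org.apache.tika.language.LanguageProfilerBuilder -similarity a.txt b.txt UTF-8
    java org.apache.tika.language.LanguageProfilerBuilder -score en document.txt UTF-8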
Use of org.apache.tika.exception.TikaException in project tika by apache.
The class ExternalParser, method parse.
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp) throws IOException, SAXException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
    File output = null;
    // Build our command
    String[] cmd;
    if (command.length == 1) {
        cmd = command[0].split(" ");
    } else {
        cmd = new String[command.length];
        System.arraycopy(command, 0, cmd, 0, command.length);
    }
    for (int i = 0; i < cmd.length; i++) {
        if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
            cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
            inputToStdIn = false;
        }
        if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
            output = tmp.createTemporaryFile();
            outputFromStdOut = false;
            cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
        }
    }
    // Execute; let any IOException from exec propagate rather than
    // swallowing it and dereferencing a null process below
    Process process;
    if (cmd.length == 1) {
        process = Runtime.getRuntime().exec(cmd[0]);
    } else {
        process = Runtime.getRuntime().exec(cmd);
    }
    try {
        if (inputToStdIn) {
            sendInput(process, stream);
        } else {
            process.getOutputStream().close();
        }
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        if (hasPatterns) {
            extractMetadata(err, metadata);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                extractMetadata(out, metadata);
            }
        } else {
            ignoreStream(err);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                ignoreStream(out);
            }
        }
    } finally {
        try {
            process.waitFor();
        } catch (InterruptedException ignore) {
        }
    }
    // Grab the output if we haven't already
    if (!outputFromStdOut) {
        extractOutput(new FileInputStream(output), xhtml);
    }
}
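ExternalParser itself is configured with the command template that this method tokenizes: occurrences of INPUT_FILE_TOKEN and OUTPUT_FILE_TOKEN are replaced with temp-file paths, which is what flips inputToStdIn and outputFromStdOut to false above. A configuration sketch, assuming Tika 1.x's org.apache.tika.parser.external.ExternalParser and a locally installed pdftotext binary (both assumptions, not taken from the excerpt):

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;

public class ExternalParserSketch {
    public static void main(String[] args) throws Exception {
        ExternalParser parser = new ExternalParser();
        parser.setSupportedTypes(Collections.singleton(MediaType.application("pdf")));
        // File tokens instead of stdin/stdout; pdftotext is an assumed external tool
        parser.setCommand("pdftotext", ExternalParser.INPUT_FILE_TOKEN, ExternalParser.OUTPUT_FILE_TOKEN);
        BodyContentHandler handler = new BodyContentHandler();
        try (InputStream in = Files.newInputStream(Paths.get(args[0]))) {
            parser.parse(in, handler, new Metadata(), new ParseContext());
        }
        System.out.println(handler);
    }
}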