use of org.apache.tika.mime.MediaType in project tika by apache.
the class TrainedModelDetector method detect.
/**
 * Detects the media type of a stream by scoring its byte histogram against
 * every trained model registered in {@code MODEL_MAP}.
 * <p>
 * The stream is marked with {@code getMinLength()} before it is read and is
 * reset afterwards, including when reading or prediction fails, so the
 * caller gets the stream back at its original position.
 *
 * @param input    stream to inspect; may be {@code null}
 * @param metadata not consulted by this detector
 * @return the media type whose model yields the highest prediction strictly
 *         above 0.5, {@link MediaType#OCTET_STREAM} when no model exceeds
 *         that threshold, or {@code null} when {@code input} is {@code null}
 * @throws IOException if the stream cannot be read or reset
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // NOTE(review): null input yields null rather than OCTET_STREAM; kept as-is
    // so existing callers observe the same result.
    if (input == null) {
        return null;
    }

    input.mark(getMinLength());
    try {
        // Convert the stream content to a byte histogram.
        float[] histogram = readByteFrequencies(input);

        // Probability threshold: any prediction at or below this value is
        // treated as MediaType.OCTET_STREAM.
        float threshold = 0.5f;
        float maxprob = threshold;
        MediaType maxType = MediaType.OCTET_STREAM;

        // Walk all models and keep the one with the highest prediction value.
        for (MediaType key : MODEL_MAP.keySet()) {
            TrainedModel model = MODEL_MAP.get(key);
            float prob = model.predict(histogram);
            if (maxprob < prob) {
                maxprob = prob;
                maxType = key;
            }
        }
        return maxType;
    } finally {
        // Always rewind, even on failure, so the stream is never left consumed.
        input.reset();
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaConfig method mediaTypesListFromDomElement.
/**
 * Collects the media types named by the direct child elements of
 * {@code node} whose tag name equals {@code tag}.
 *
 * @param node element whose child nodes are scanned
 * @param tag  tag name a child element must carry to be considered
 * @return the parsed media types, or an empty set when no child matched
 * @throws TikaException if the text of a matching child does not parse to a
 *                       media type
 * @throws IOException   NOTE(review): presumably propagated from
 *                       {@code getText}; confirm against that helper
 */
private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag) throws TikaException, IOException {
    // Created lazily so that "nothing matched" can be told apart below.
    Set<MediaType> found = null;
    NodeList childNodes = node.getChildNodes();
    for (int idx = 0; idx < childNodes.getLength(); idx++) {
        Node candidate = childNodes.item(idx);
        if (!(candidate instanceof Element)) {
            continue;
        }
        Element element = (Element) candidate;
        if (!tag.equals(element.getTagName())) {
            continue;
        }
        String mime = getText(element);
        MediaType parsed = MediaType.parse(mime);
        if (parsed == null) {
            throw new TikaException("Invalid media type name: " + mime);
        }
        if (found == null) {
            found = new HashSet<>();
        }
        found.add(parsed);
    }
    if (found == null) {
        return Collections.emptySet();
    }
    return found;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class NNExampleModelDetector method readDescription.
/**
 * Reads the comment line that carries the model configuration and copies
 * its values into the builder. Callers should make sure the first character
 * of the line is '#'.
 * <p>
 * The line is split on tab characters. In the example grb model file the
 * fields after the leading one are: 1) the media type, 2) the number of
 * input units, 3) the number of hidden units and 4) the number of output
 * units.
 *
 * @param builder builder that receives the unit counts and the media type
 * @param line    tab-separated configuration line
 * @throws RuntimeException if the line cannot be processed; the original
 *                          exception is attached as the cause
 */
private void readDescription(final NNTrainedModelBuilder builder, final String line) {
    final String[] fields = line.split("\t");
    try {
        // Parse every field first; the builder is only touched once all of
        // them have been read successfully.
        final MediaType mediaType = MediaType.parse(fields[1]);
        final int inputUnits = Integer.parseInt(fields[2]);
        final int hiddenUnits = Integer.parseInt(fields[3]);
        final int outputUnits = Integer.parseInt(fields[4]);

        builder.setNumOfInputs(inputUnits);
        builder.setNumOfHidden(hiddenUnits);
        builder.setNumOfOutputs(outputUnits);
        builder.setType(mediaType);
    } catch (Exception e) {
        final String message = "Unable to parse the model configuration";
        if (log.isLoggable(Level.WARNING)) {
            log.log(Level.WARNING, message, e);
        }
        throw new RuntimeException(message, e);
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TesseractOCRParserTest method offersTypesIfFound.
/*
 When the test is able to run (Tesseract is assumed to be available), check
 that TesseractOCRParser reports the expected number of supported types, that
 PNG is among them, and that DefaultParser picks TesseractOCRParser for PNG.
 */
@Test
public void offersTypesIfFound() throws Exception {
    TesseractOCRParser tesseract = new TesseractOCRParser();
    DefaultParser composite = new DefaultParser();
    ParseContext context = new ParseContext();
    MediaType pngType = MediaType.image("png");

    // Skip the remaining checks unless canRun() holds.
    assumeTrue(canRun());

    // The OCR parser is expected to advertise 8 supported types...
    int supportedCount = tesseract.getSupportedTypes(context).size();
    assertEquals(8, supportedCount);

    // ...and PNG has to be one of them.
    boolean offersPng = tesseract.getSupportedTypes(context).contains(pngType);
    assertTrue(offersPng);

    // DefaultParser should select the TesseractOCRParser for PNG.
    Object selected = composite.getParsers(context).get(pngType);
    assertEquals(TesseractOCRParser.class, selected.getClass());
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class CompressorParserTest method testCoverage.
@Test
public void testCoverage() throws Exception {
    // Guard for commons-compress upgrades: every input-stream compressor
    // name known to CompressorStreamFactory must map to a media type that
    // CompressorParser supports, unless it is listed in NOT_COVERED.
    CompressorStreamFactory streamFactory = new CompressorStreamFactory(true, 1000);
    CompressorParser parser = new CompressorParser();
    ParseContext context = new ParseContext();
    for (String compressorName : streamFactory.getInputStreamCompressorNames()) {
        MediaType mediaType = CompressorParser.getMediaType(compressorName);
        if (!NOT_COVERED.contains(mediaType)) {
            // Report the name of the stream type that has no media type mapping.
            if (mediaType.equals(MediaType.OCTET_STREAM)) {
                fail("getting octet-stream for: " + compressorName);
            }
            boolean supported = parser.getSupportedTypes(context).contains(mediaType);
            if (!supported) {
                fail("CompressorParser should support: " + mediaType);
            }
        }
    }
}
Aggregations