Search in sources :

Example 36 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TrainedModelDetector method detect.

/**
 * Detects the media type of a stream by scoring its byte-frequency histogram
 * against every trained model registered in {@code MODEL_MAP}.
 *
 * <p>The stream is marked before the histogram is read and is always reset
 * afterwards, including when reading fails, so the caller gets the stream
 * back at its original position.
 *
 * @param input    document stream, or {@code null}; {@code mark}/{@code reset}
 *                 are called on it
 * @param metadata not consulted by this method
 * @return the type whose model gives the highest prediction strictly above
 *         the 0.5 threshold, or {@link MediaType#OCTET_STREAM} when no model
 *         exceeds it or when {@code input} is {@code null}
 * @throws IOException if reading from or resetting the stream fails
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    if (input == null) {
        // Report "unknown" as octet-stream rather than null, so callers never
        // have to null-check the detected type.
        return MediaType.OCTET_STREAM;
    }

    // Convert the leading bytes of the stream to a byte histogram.
    final float[] histogram;
    input.mark(getMinLength());
    try {
        histogram = readByteFrequencies(input);
    } finally {
        // Rewind even if reading failed; the predictions below no longer need the stream.
        input.reset();
    }

    // Probability threshold: any prediction at or below this value is treated
    // as MediaType.OCTET_STREAM.
    final float threshold = 0.5f;
    float maxprob = threshold;
    MediaType maxType = MediaType.OCTET_STREAM;

    // Keep the type whose model gives the highest prediction value.
    for (MediaType key : MODEL_MAP.keySet()) {
        TrainedModel model = MODEL_MAP.get(key);
        float prob = model.predict(histogram);
        if (maxprob < prob) {
            maxprob = prob;
            maxType = key;
        }
    }
    return maxType;
}
Also used : MediaType(org.apache.tika.mime.MediaType)

Example 37 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaConfig method mediaTypesListFromDomElement.

/**
 * Collects the media types declared by the direct child elements of
 * {@code node} whose tag name equals {@code tag}.
 *
 * @param node parent element whose child nodes are scanned
 * @param tag  tag name a child element must have to be considered
 * @return the parsed media types, or an empty set when no child matched
 * @throws TikaException if the text of a matching child cannot be parsed as a media type
 * @throws IOException   declared by the signature; not thrown directly here
 */
private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag) throws TikaException, IOException {
    Set<MediaType> collected = new HashSet<>();
    NodeList childNodes = node.getChildNodes();
    for (int idx = 0; idx < childNodes.getLength(); idx++) {
        Node candidate = childNodes.item(idx);
        // Only element children can carry a media type declaration.
        if (!(candidate instanceof Element)) {
            continue;
        }
        Element element = (Element) candidate;
        if (!tag.equals(element.getTagName())) {
            continue;
        }
        String mime = getText(element);
        MediaType parsed = MediaType.parse(mime);
        if (parsed == null) {
            throw new TikaException("Invalid media type name: " + mime);
        }
        collected.add(parsed);
    }
    if (collected.isEmpty()) {
        return Collections.emptySet();
    }
    return collected;
}
Also used : TikaException(org.apache.tika.exception.TikaException) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) Element(org.w3c.dom.Element) MediaType(org.apache.tika.mime.MediaType) HashSet(java.util.HashSet)

Example 38 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class NNExampleModelDetector method readDescription.

/**
 * Reads the model configuration from a description (comment) line of the
 * model file and stores it in the given builder.
 *
 * <p>The line is split on tab characters. Field 0 is not read here (the
 * first character of the line is expected to be {@code #}); the remaining
 * fields are, in order:
 * <ol>
 *   <li>the media type,</li>
 *   <li>the number of input units,</li>
 *   <li>the number of hidden units,</li>
 *   <li>the number of output units.</li>
 * </ol>
 *
 * @param builder builder that receives the parsed configuration
 * @param line    tab-separated description line
 * @throws RuntimeException if a field is missing or cannot be parsed
 */
private void readDescription(final NNTrainedModelBuilder builder, final String line) {
    final String[] fields = line.split("\t");
    try {
        // Parse every field before touching the builder, so a malformed line
        // leaves the builder unchanged.
        final MediaType mediaType = MediaType.parse(fields[1]);
        final int inputCount = Integer.parseInt(fields[2]);
        final int hiddenCount = Integer.parseInt(fields[3]);
        final int outputCount = Integer.parseInt(fields[4]);

        builder.setNumOfInputs(inputCount);
        builder.setNumOfHidden(hiddenCount);
        builder.setNumOfOutputs(outputCount);
        builder.setType(mediaType);
    } catch (Exception e) {
        final String message = "Unable to parse the model configuration";
        if (log.isLoggable(Level.WARNING)) {
            log.log(Level.WARNING, message, e);
        }
        throw new RuntimeException(message, e);
    }
}
Also used : MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 39 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TesseractOCRParserTest method offersTypesIfFound.

/*
    If Tesseract is found, test that the OCR parser reports the expected
    number of supported media types and is the parser chosen for PNG.
     */
@Test
public void offersTypesIfFound() throws Exception {
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    DefaultParser fallbackParser = new DefaultParser();
    ParseContext context = new ParseContext();
    MediaType pngType = MediaType.image("png");

    // Skip the remaining checks unless the test is able to run.
    assumeTrue(canRun());

    // The OCR parser should advertise 8 supported media types, PNG among them.
    int supportedCount = ocrParser.getSupportedTypes(context).size();
    assertEquals(8, supportedCount);
    boolean supportsPng = ocrParser.getSupportedTypes(context).contains(pngType);
    assertTrue(supportsPng);

    // DefaultParser will now select the TesseractOCRParser for PNG.
    Class<?> selectedForPng = fallbackParser.getParsers(context).get(pngType).getClass();
    assertEquals(TesseractOCRParser.class, selectedForPng);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType) DefaultParser(org.apache.tika.parser.DefaultParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 40 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class CompressorParserTest method testCoverage.

/**
 * Verifies that CompressorParser covers every input stream type handled by
 * CompressorStreamFactory. When commons-compress is updated and adds a new
 * stream type, this test makes sure we notice and handle it.
 */
@Test
public void testCoverage() throws Exception {
    CompressorStreamFactory streamFactory = new CompressorStreamFactory(true, 1000);
    CompressorParser parserUnderTest = new CompressorParser();
    ParseContext context = new ParseContext();
    for (String compressorName : streamFactory.getInputStreamCompressorNames()) {
        MediaType mediaType = CompressorParser.getMediaType(compressorName);
        // Types listed in NOT_COVERED are skipped.
        if (!NOT_COVERED.contains(mediaType)) {
            // Report the name of the stream that has no media type mapping.
            if (mediaType.equals(MediaType.OCTET_STREAM)) {
                fail("getting octet-stream for: " + compressorName);
            }
            boolean supported = parserUnderTest.getSupportedTypes(context).contains(mediaType);
            if (!supported) {
                fail("CompressorParser should support: " + mediaType.toString());
            }
        }
    }
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) CompressorStreamFactory(org.apache.commons.compress.compressors.CompressorStreamFactory) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

- MediaType (org.apache.tika.mime.MediaType): 95
- Metadata (org.apache.tika.metadata.Metadata): 29
- Test (org.junit.Test): 28
- InputStream (java.io.InputStream): 26
- IOException (java.io.IOException): 18
- Parser (org.apache.tika.parser.Parser): 18
- TikaInputStream (org.apache.tika.io.TikaInputStream): 17
- ParseContext (org.apache.tika.parser.ParseContext): 17
- TikaException (org.apache.tika.exception.TikaException): 14
- AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 14
- CompositeParser (org.apache.tika.parser.CompositeParser): 13
- ContentHandler (org.xml.sax.ContentHandler): 13
- BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 12
- Detector (org.apache.tika.detect.Detector): 11
- TikaTest (org.apache.tika.TikaTest): 10
- HashSet (java.util.HashSet): 8
- ByteArrayInputStream (java.io.ByteArrayInputStream): 7
- ArrayList (java.util.ArrayList): 7
- TikaConfig (org.apache.tika.config.TikaConfig): 7
- MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry): 7