Search in sources :

Example 1 with TikaException

use of org.apache.tika.exception.TikaException in project che by eclipse.

the class MediaTypeFilter method accept.

/**
 * Decides whether the given file matches this filter based on its detected media type.
 *
 * <p>The file content is opened, its media type is detected with a freshly built
 * {@link TikaConfig} detector, and the result is checked against
 * {@code excludedMediaTypes} (full media type) and {@code excludedTypes}
 * (the value of {@code MediaType.getType()}).
 *
 * @param file the virtual file whose content is inspected
 * @return {@code true} if the detected media type or its type is in one of the
 *         excluded sets, or if the content could not be read or detected;
 *         {@code false} otherwise
 */
@Override
public boolean accept(VirtualFile file) {
    try (InputStream content = file.getContent()) {
        final MediaType detected = new TikaConfig().getDetector().detect(content, new Metadata());
        return excludedMediaTypes.contains(detected) || excludedTypes.contains(detected.getType());
    } catch (TikaException | ForbiddenException | ServerException | IOException e) {
        // Any failure to read or detect the content is treated the same as an excluded type.
        return true;
    }
}
Also used : ForbiddenException(org.eclipse.che.api.core.ForbiddenException) TikaException(org.apache.tika.exception.TikaException) ServerException(org.eclipse.che.api.core.ServerException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 2 with TikaException

use of org.apache.tika.exception.TikaException in project lucene-solr by apache.

the class ExtractingDocumentLoader method load.

/**
 * Parses the given content stream with a Tika parser and either indexes the
 * result or, in extract-only mode, writes the serialized content and its
 * metadata to the response.
 *
 * <p>Parser selection: when the request carries {@code ExtractingParams.STREAM_TYPE},
 * the parser registered for that media type is looked up; otherwise
 * {@code autoDetectParser} is used. If no parser is found a
 * {@code BAD_REQUEST} {@link SolrException} is thrown.
 *
 * <p>NOTE(review): most options are read from the {@code params} field rather than
 * from {@code req.getParams()}; the relationship between the two is not visible
 * in this method - confirm against the constructor.
 *
 * @param req the request supplying parameters, schema and core
 * @param rsp the response that receives extracted content in extract-only mode
 * @param stream the content to parse; its underlying stream is closed quietly on exit
 * @param processor not referenced directly in this method body
 * @throws Exception a {@link SolrException} with {@code SERVER_ERROR} wraps
 *         {@link SAXException} and (unless {@code ignoreTikaException} is set)
 *         {@link TikaException}; other exceptions propagate unchanged
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            // Record where the content came from alongside the extracted metadata
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            // By default SAX events go straight to the Solr handler; the branches below
            // may replace the target with a serializer and/or an XPath-matching wrapper.
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly == true) {
                // Extract-only: serialize the parsed content into 'writer' instead of indexing it
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                // Indexing mode with an XPath restriction: only matching events reach the Solr handler
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    // NOTE(review): 'is' is not closed in this method; presumably epp.parse() closes it - confirm
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                // Parse failures are either logged and skipped or escalated, depending on configuration
                if (ignoreTikaException)
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                else
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
            }
            if (extractOnly == false) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                // Return the serialized content under the stream name ...
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                // ... and every metadata name with all of its values under "<stream name>_metadata"
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            // Always release the content stream, even when parsing failed
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 3 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class LanguageProfilerBuilder method create.

/**
 * Creates a new language profile from a text stream (preferably quite
 * large - 5-10k lines).
 *
 * <p>The whole stream is decoded with the given encoding and handed to
 * {@code analyze}. The stream is read to its end but is not closed here;
 * closing it remains the caller's responsibility.
 *
 * @param name to be given for the profile
 * @param is a stream to be read
 * @param encoding is the encoding of stream
 * @return the newly built profile
 * @throws TikaException if the stream could not be read or the encoding is
 *         not supported; the underlying {@link IOException} is kept as the cause
 */
public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
    LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
    StringBuilder text = new StringBuilder();
    char[] buffer = new char[4096];
    try {
        // Decode through a single reader so that a multi-byte character split across
        // two read() calls is decoded correctly instead of being turned into
        // replacement characters at every chunk boundary.
        java.io.Reader reader = new java.io.InputStreamReader(new BufferedInputStream(is), encoding);
        int len;
        while ((len = reader.read(buffer)) != -1) {
            text.append(buffer, 0, len);
        }
    } catch (IOException e) {
        throw new TikaException("Could not create profile, " + e.getMessage(), e);
    }
    newProfile.analyze(text);
    return newProfile;
}
Also used : TikaException(org.apache.tika.exception.TikaException) BufferedInputStream(java.io.BufferedInputStream) IOException(java.io.IOException)

Example 4 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class LanguageProfilerBuilder method main.

/**
 * Command-line entry point, used for testing only.
 *
 * <p>Supported options, each followed by exactly three values:
 * <ul>
 *   <li>{@code -create profilename filename encoding} - build a profile from a
 *       text file and save it as {@code profilename.FILE_EXTENSION}</li>
 *   <li>{@code -similarity file1 file2 encoding} - print the similarity of the
 *       normalized profiles of two text files</li>
 *   <li>{@code -score profile-name filename encoding} - print the similarity of a
 *       saved profile against the profile of a text file</li>
 * </ul>
 * If several options are given, the last one decides the command that runs.
 * Missing arguments or an unrecognised command print the usage text.
 *
 * @param args the command-line arguments
 */
public static void main(String[] args) {
    // Example invocation: -create he sample_he.txt utf-8
    String usage = "Usage: NGramProfile " + "[-create profilename filename encoding] " + "[-similarity file1 file2 encoding] " + "[-score profile-name filename encoding]";
    final int CREATE = 1;
    final int SIMILARITY = 2;
    final int SCORE = 3;
    int command = 0;
    String profilename = "";
    String filename = "";
    String filename2 = "";
    String encoding = "";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        // parse command line
        String option = args[i];
        boolean isCreate = option.equals("-create");
        boolean isSimilarity = option.equals("-similarity");
        boolean isScore = option.equals("-score");
        if (!isCreate && !isSimilarity && !isScore) {
            continue;
        }
        // Every option consumes the three following arguments; bail out with the
        // usage text instead of failing with an ArrayIndexOutOfBoundsException.
        if (i + 3 >= args.length) {
            System.err.println(usage);
            System.exit(-1);
        }
        if (isCreate) {
            command = CREATE;
            profilename = args[++i];
            filename = args[++i];
            encoding = args[++i];
        } else if (isSimilarity) {
            command = SIMILARITY;
            filename = args[++i];
            filename2 = args[++i];
            encoding = args[++i];
        } else {
            command = SCORE;
            profilename = args[++i];
            filename = args[++i];
            encoding = args[++i];
        }
    }
    try {
        switch(command) {
            case CREATE: {
                LanguageProfilerBuilder newProfile = createFromFile(profilename, filename, encoding);
                // Close the output stream so the saved profile is fully written to disk.
                try (FileOutputStream fos = new FileOutputStream(new File(profilename + "." + FILE_EXTENSION))) {
                    newProfile.save(fos);
                }
                System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created.");
                break;
            }
            case SIMILARITY: {
                LanguageProfilerBuilder first = createFromFile(filename, filename, encoding);
                first.normalize();
                LanguageProfilerBuilder second = createFromFile(filename2, filename2, encoding);
                second.normalize();
                System.out.println("Similarity is " + first.getSimilarity(second));
                break;
            }
            case SCORE: {
                LanguageProfilerBuilder sample = createFromFile(filename, filename, encoding);
                LanguageProfilerBuilder compare = new LanguageProfilerBuilder(profilename, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
                try (FileInputStream fis = new FileInputStream(new File(profilename + "." + FILE_EXTENSION))) {
                    compare.load(fis);
                }
                System.out.println("Score is " + compare.getSimilarity(sample));
                break;
            }
            default:
                // No recognised option was supplied.
                System.err.println(usage);
                break;
        }
    } catch (Exception e) {
        // Test-only tool: report the failure on the console rather than propagating it.
        e.printStackTrace();
    }
}

/** Builds a profile named {@code profileName} from the given text file, closing the file afterwards. */
private static LanguageProfilerBuilder createFromFile(String profileName, String filename, String encoding) throws IOException, TikaException {
    try (FileInputStream fis = new FileInputStream(new File(filename))) {
        return LanguageProfilerBuilder.create(profileName, fis, encoding);
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) File(java.io.File) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException)

Example 5 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ExternalParser method parse.

/**
 * Runs the configured external command against the given stream and feeds its
 * output into the XHTML handler and/or the metadata.
 *
 * <p>If a command token contains {@code INPUT_FILE_TOKEN}, the stream's file path is
 * substituted and nothing is written to the process's standard input; otherwise the
 * stream is sent to standard input. If a token contains {@code OUTPUT_FILE_TOKEN}, a
 * temporary file path is substituted and the content is read from that file after
 * the process has finished; otherwise content is read from standard output.
 *
 * @param stream the input document
 * @param xhtml the handler receiving the extracted content
 * @param metadata the metadata populated when metadata patterns are configured
 * @param tmp source of the temporary output file, when one is needed
 * @throws IOException if communicating with the process or reading its output fails
 * @throws SAXException if the content handler rejects the output
 * @throws TikaException if the external process could not be started
 */
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp) throws IOException, SAXException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
    File output = null;
    // Build our command
    String[] cmd;
    if (command.length == 1) {
        cmd = command[0].split(" ");
    } else {
        cmd = new String[command.length];
        System.arraycopy(command, 0, cmd, 0, command.length);
    }
    for (int i = 0; i < cmd.length; i++) {
        if (cmd[i].contains(INPUT_FILE_TOKEN)) {
            cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
            inputToStdIn = false;
        }
        if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
            output = tmp.createTemporaryFile();
            outputFromStdOut = false;
            cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
        }
    }
    // Execute
    Process process;
    try {
        if (cmd.length == 1) {
            process = Runtime.getRuntime().exec(cmd[0]);
        } else {
            process = Runtime.getRuntime().exec(cmd);
        }
    } catch (Exception e) {
        // Without a process nothing below can work; fail with a meaningful error
        // instead of continuing and hitting a NullPointerException.
        throw new TikaException("Unable to start external process: " + e.getMessage(), e);
    }
    try {
        if (inputToStdIn) {
            sendInput(process, stream);
        } else {
            process.getOutputStream().close();
        }
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        if (hasPatterns) {
            extractMetadata(err, metadata);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                extractMetadata(out, metadata);
            }
        } else {
            ignoreStream(err);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                ignoreStream(out);
            }
        }
    } finally {
        try {
            process.waitFor();
        } catch (InterruptedException e) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
        }
    }
    // Grab the output if we haven't already
    if (!outputFromStdOut) {
        try (InputStream fileOutput = new FileInputStream(output)) {
            extractOutput(fileOutput, xhtml);
        }
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) File(java.io.File) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) FileInputStream(java.io.FileInputStream)

Aggregations

TikaException (org.apache.tika.exception.TikaException) 142, IOException (java.io.IOException) 54, SAXException (org.xml.sax.SAXException) 42, InputStream (java.io.InputStream) 37, TikaInputStream (org.apache.tika.io.TikaInputStream) 33, Metadata (org.apache.tika.metadata.Metadata) 33, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 29, Test (org.junit.Test) 19, ParseContext (org.apache.tika.parser.ParseContext) 18, ContentHandler (org.xml.sax.ContentHandler) 17, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 16, CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream) 15, TemporaryResources (org.apache.tika.io.TemporaryResources) 15, MediaType (org.apache.tika.mime.MediaType) 13, Parser (org.apache.tika.parser.Parser) 13, ByteArrayInputStream (java.io.ByteArrayInputStream) 12, ArrayList (java.util.ArrayList) 11, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 11, File (java.io.File) 8, EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler) 8