Search in sources :

Example 6 with Metadata

use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.

the class BinaryParseData method setBinaryContent.

/**
 * Runs {@code AUTO_DETECT_PARSER} over the given bytes, serializing the parse
 * events through a transformer handler into an in-memory buffer, and stores the
 * buffered text in {@code html}.
 *
 * <p>Any exception raised while building the handler, parsing, or decoding is
 * logged and swallowed; in that case {@code html} is not assigned by this call.
 *
 * @param data raw bytes of the binary document to parse
 */
public void setBinaryContent(byte[] data) {
    InputStream source = new ByteArrayInputStream(data);
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    try {
        TransformerHandler serializer = getTransformerHandler(sink, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
        AUTO_DETECT_PARSER.parse(source, serializer, new Metadata(), context);
        String markup = new String(sink.toByteArray(), DEFAULT_ENCODING);
        // Hack: strip every occurrence of the XHTML namespace URI from the serialized output
        this.html = markup.replace("http://www.w3.org/1999/xhtml", "");
    } catch (Exception e) {
        logger.error("Error parsing file", e);
    }
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException)

Example 7 with Metadata

use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.

the class Parser method parse.

/**
 * Parses the fetched content of {@code page} and attaches the matching parse data
 * (binary, plain text, or HTML) to it, including the outgoing URLs found.
 *
 * <ul>
 *   <li>Binary content is only accepted when the configuration includes binary
 *       content in the crawl; it is either processed or replaced by an empty
 *       HTML document, depending on the configuration.</li>
 *   <li>Plain text is decoded with the page charset when one is known, otherwise
 *       with the platform default charset.</li>
 *   <li>Everything else is treated as HTML: text, title, meta tags, language and
 *       outgoing links are extracted.</li>
 * </ul>
 *
 * @param page       the fetched page whose content is parsed; receives the parse data
 * @param contextURL URL used to resolve relative links; overridden by the document's
 *                   own base URL when the HTML declares one
 * @throws NotAllowedContentException if the page is binary and binary content is not
 *                                    included in the crawl
 * @throws ParseException             if the content cannot be parsed or decoded
 */
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
    if (Util.hasBinaryContent(page.getContentType())) {
        // BINARY
        BinaryParseData parseData = new BinaryParseData();
        if (config.isIncludeBinaryContentInCrawling()) {
            if (config.isProcessBinaryContentInCrawling()) {
                parseData.setBinaryContent(page.getContentData());
            } else {
                parseData.setHtml("<html></html>");
            }
            page.setParseData(parseData);
            // A null html here means the binary content could not be processed
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
        } else {
            throw new NotAllowedContentException();
        }
    } else if (Util.hasPlainTextContent(page.getContentType())) {
        // plain Text
        try {
            TextParseData parseData = new TextParseData();
            if (page.getContentCharset() == null) {
                parseData.setTextContent(new String(page.getContentData()));
            } else {
                parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
            page.setParseData(parseData);
        } catch (Exception e) {
            // Trailing throwable argument keeps the stack trace in the log
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    } else {
        // isHTML
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            // Trailing throwable argument keeps the stack trace in the log
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }
        // Fall back to the encoding reported by the parser when the page has none
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }
        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());
        // Please note that identifying language takes less than 10 milliseconds
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());
        Set<WebURL> outgoingUrls = new HashSet<>();
        // A base URL declared by the document takes precedence over the caller's context URL
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        int maxOutgoingLinks = config.getMaxOutgoingLinksToFollow();
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Check the cap before adding so that at most maxOutgoingLinks links are collected
            if (urlCount >= maxOutgoingLinks) {
                break;
            }
            String href = urlAnchorPair.getHref();
            if (href == null) {
                continue;
            }
            String trimmedHref = href.trim();
            if (trimmedHref.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the scheme check independent of the default locale
            // (e.g. Turkish lowercases 'I' to a dotless i, which would let "MAILTO:" through)
            String hrefLoweredCase = trimmedHref.toLowerCase(java.util.Locale.ROOT);
            if (hrefLoweredCase.contains("javascript:") || hrefLoweredCase.contains("mailto:")
                    || hrefLoweredCase.contains("@")) {
                continue;
            }
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url != null) {
                WebURL webURL = new WebURL();
                webURL.setURL(url);
                webURL.setTag(urlAnchorPair.getTag());
                webURL.setAnchor(urlAnchorPair.getAnchor());
                outgoingUrls.add(webURL);
                urlCount++;
            }
        }
        parseData.setOutgoingUrls(outgoingUrls);
        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) WebURL(edu.uci.ics.crawler4j.url.WebURL) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ParseException(edu.uci.ics.crawler4j.crawler.exceptions.ParseException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(edu.uci.ics.crawler4j.crawler.exceptions.ParseException)

Example 8 with Metadata

use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.

the class HtmlContentHandlerTest method parseHtml.

/**
 * Parses the given HTML text with the shared {@code parser} and returns the
 * content handler that received the parse events.
 *
 * <p>An {@code AllTagMapper} is registered in {@code parseContext} as the
 * {@code HtmlMapper} before parsing.
 *
 * @param html HTML markup to parse
 * @return the handler populated by the parse
 * @throws Exception if the parser fails
 */
private HtmlContentHandler parseHtml(String html) throws Exception {
    // Encode explicitly so the bytes fed to the parser do not depend on the platform default charset
    byte[] htmlBytes = html.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    ByteArrayInputStream bais = new ByteArrayInputStream(htmlBytes);
    Metadata metadata = new Metadata();
    HtmlContentHandler contentHandler = new HtmlContentHandler();
    // Direct construction replaces the deprecated reflective Class.newInstance()
    parseContext.set(HtmlMapper.class, new AllTagMapper());
    parser.parse(bais, contentHandler, metadata, parseContext);
    return contentHandler;
}
Also used : AllTagMapper(edu.uci.ics.crawler4j.parser.AllTagMapper) ByteArrayInputStream(java.io.ByteArrayInputStream) HtmlContentHandler(edu.uci.ics.crawler4j.parser.HtmlContentHandler) Metadata(org.apache.tika.metadata.Metadata)

Example 9 with Metadata

use of org.apache.tika.metadata.Metadata in project Asqatasun by Asqatasun.

the class UploadAuditSetUpFormValidator method validateFiles.

/**
 * Control whether the uploaded files are of HTML type and whether their
 * size is under the maxFileSize limit.
 *
 * <p>For each uploaded file: a file larger than {@code maxFileSize} is rejected
 * with the size-exceeded message; a non-empty file whose detected MIME type is
 * not in {@code authorizedMimeType} is rejected with the not-HTML message. If
 * every file is empty (or none is present), a general "no file uploaded" error
 * is recorded.
 *
 * @param uploadAuditSetUpCommand the command holding the uploaded files
 * @param errors                  the binding errors that rejections are recorded on
 */
private void validateFiles(AuditSetUpCommand uploadAuditSetUpCommand, Errors errors) {
    boolean emptyFile = true;
    Metadata metadata = new Metadata();
    MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
    CommonsMultipartFile[] files = uploadAuditSetUpCommand.getFileInputList();
    for (int i = 0; i < files.length; i++) {
        String fieldName = ID_INPUT_FILE_PREFIX + "[" + i + "]";
        try {
            CommonsMultipartFile cmf = files[i];
            if (cmf.getSize() > maxFileSize) {
                Long maxFileSizeInMega = maxFileSize / 1000000;
                String[] arg = { maxFileSizeInMega.toString() };
                errors.rejectValue(fieldName, FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}");
            }
            if (cmf.getSize() > 0) {
                emptyFile = false;
                String mime;
                // Close the upload stream once detection is done instead of leaking it
                try (BufferedInputStream in = new BufferedInputStream(cmf.getInputStream())) {
                    mime = mimeTypes.detect(in, metadata).toString();
                }
                LOGGER.debug("mime  " + mime + "  " + cmf.getOriginalFilename());
                if (!authorizedMimeType.contains(mime)) {
                    errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
                }
            }
        } catch (IOException ex) {
            // An unreadable upload is reported to the user as a non-HTML file
            LOGGER.warn(ex);
            errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
        }
    }
    if (emptyFile) {
        // if no file is uploaded
        LOGGER.debug("emptyFiles");
        errors.rejectValue(GENERAL_ERROR_MSG_KEY, NO_FILE_UPLOADED_MSG_BUNDLE_KEY);
    }
}
Also used : BufferedInputStream(java.io.BufferedInputStream) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) MimeTypes(org.apache.tika.mime.MimeTypes) CommonsMultipartFile(org.springframework.web.multipart.commons.CommonsMultipartFile)

Example 10 with Metadata

use of org.apache.tika.metadata.Metadata in project jackrabbit by apache.

the class LazyTextExtractorFieldTest method testEmptyParser.

/**
 * Runs a parsing task over a random binary value declared as
 * {@code application/java-archive} and asserts that the text handed to
 * {@code setExtractedText} is the empty string.
 *
 * @see <a
 *      href="https://issues.apache.org/jira/browse/JCR-3296">JCR-3296</a>
 *      Indexing ignored file types creates some garbage
 */
public void testEmptyParser() throws Exception {
    InternalValue randomBinary = InternalValue.create(new RandomInputStream(1, 1024));

    // Describe the value as a UTF-8 encoded Java archive
    Metadata archiveMetadata = new Metadata();
    archiveMetadata.set(Metadata.CONTENT_TYPE, "application/java-archive");
    archiveMetadata.set(Metadata.CONTENT_ENCODING, "UTF-8");

    Parser indexParser = getSearchIndex().getParser();
    new ParsingTask(indexParser, randomBinary, archiveMetadata, Integer.MAX_VALUE) {

        public void setExtractedText(String value) {
            // Nothing should be extracted from this content
            assertEquals("", value);
        }
    }.run();
}
Also used : ParsingTask(org.apache.jackrabbit.core.query.lucene.LazyTextExtractorField.ParsingTask) Metadata(org.apache.tika.metadata.Metadata) InternalValue(org.apache.jackrabbit.core.value.InternalValue) RandomInputStream(org.apache.jackrabbit.core.data.RandomInputStream) Parser(org.apache.tika.parser.Parser)

Aggregations

Metadata (org.apache.tika.metadata.Metadata) 643, Test (org.junit.Test) 467, InputStream (java.io.InputStream) 318, ParseContext (org.apache.tika.parser.ParseContext) 281, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 268, TikaTest (org.apache.tika.TikaTest) 257, ContentHandler (org.xml.sax.ContentHandler) 228, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 151, ByteArrayInputStream (java.io.ByteArrayInputStream) 141, Parser (org.apache.tika.parser.Parser) 134, TikaInputStream (org.apache.tika.io.TikaInputStream) 131, IOException (java.io.IOException) 62, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 59, TikaException (org.apache.tika.exception.TikaException) 46, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 36, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 36, StringWriter (java.io.StringWriter) 33, Tika (org.apache.tika.Tika) 28, FileInputStream (java.io.FileInputStream) 27, MediaType (org.apache.tika.mime.MediaType) 27