Search in sources :

Example 1 with LanguageIdentifier

use of org.apache.tika.language.LanguageIdentifier in project elasticsearch by elastic.

the class AttachmentProcessor method execute.

/**
 * Parses the binary value stored in {@code field} and writes the selected
 * attachment properties, as a map, to {@code targetField}.
 *
 * <p>Only the properties listed in {@code properties} are emitted. Textual
 * properties are skipped when their value has no length; the creation date
 * is skipped only when it is {@code null}. The content length comes from the
 * parsed metadata when present, otherwise from the length of the parsed text.
 *
 * @param ingestDocument the document to read the source field from and to
 *                       write the extracted properties to
 * @throws IllegalArgumentException    if the source field is {@code null} and
 *                                     {@code ignoreMissing} is not set
 * @throws ElasticsearchParseException if anything fails while parsing or
 *                                     while collecting the properties
 */
@Override
public void execute(IngestDocument ingestDocument) {
    byte[] rawBytes = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);
    if (rawBytes == null) {
        if (ignoreMissing) {
            // Missing input is tolerated: leave the document untouched.
            return;
        }
        throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
    }

    Map<String, Object> extracted = new HashMap<>();
    try {
        Metadata tikaMetadata = new Metadata();
        String text = TikaImpl.parse(rawBytes, tikaMetadata, indexedChars);
        boolean hasText = Strings.hasLength(text);

        if (hasText && properties.contains(Property.CONTENT)) {
            // The parsed text appears to come back with a trailing newline; strip it.
            extracted.put(Property.CONTENT.toLowerCase(), text.trim());
        }

        if (hasText && properties.contains(Property.LANGUAGE)) {
            String detected = new LanguageIdentifier(text).getLanguage();
            extracted.put(Property.LANGUAGE.toLowerCase(), detected);
        }

        if (properties.contains(Property.DATE)) {
            String created = tikaMetadata.get(TikaCoreProperties.CREATED);
            if (created != null) {
                extracted.put(Property.DATE.toLowerCase(), created);
            }
        }

        if (properties.contains(Property.TITLE)) {
            String docTitle = tikaMetadata.get(TikaCoreProperties.TITLE);
            if (Strings.hasLength(docTitle)) {
                extracted.put(Property.TITLE.toLowerCase(), docTitle);
            }
        }

        if (properties.contains(Property.AUTHOR)) {
            String docAuthor = tikaMetadata.get("Author");
            if (Strings.hasLength(docAuthor)) {
                extracted.put(Property.AUTHOR.toLowerCase(), docAuthor);
            }
        }

        if (properties.contains(Property.KEYWORDS)) {
            String docKeywords = tikaMetadata.get("Keywords");
            if (Strings.hasLength(docKeywords)) {
                extracted.put(Property.KEYWORDS.toLowerCase(), docKeywords);
            }
        }

        if (properties.contains(Property.CONTENT_TYPE)) {
            String mimeType = tikaMetadata.get(Metadata.CONTENT_TYPE);
            if (Strings.hasLength(mimeType)) {
                extracted.put(Property.CONTENT_TYPE.toLowerCase(), mimeType);
            }
        }

        if (properties.contains(Property.CONTENT_LENGTH)) {
            String reportedLength = tikaMetadata.get(Metadata.CONTENT_LENGTH);
            // Prefer the length reported in the metadata; fall back to the parsed text length.
            long length = Strings.hasLength(reportedLength)
                ? Long.parseLong(reportedLength)
                : text.length();
            extracted.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }
    } catch (Exception e) {
        throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
    }

    ingestDocument.setFieldValue(targetField, extracted);
}
Also used : LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) HashMap(java.util.HashMap) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) ConfigurationUtils.newConfigurationException(org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException)

Example 2 with LanguageIdentifier

use of org.apache.tika.language.LanguageIdentifier in project crawler4j by yasserg.

the class Parser method parse.

/**
 * Parses the fetched content of {@code page} and attaches the resulting
 * parse data (and, for HTML, the detected language) to the page.
 *
 * <p>Three cases are distinguished by the page's content type:
 * <ul>
 *   <li>binary content: rejected unless the configuration includes binary
 *       content in crawling; otherwise wrapped in a {@code BinaryParseData};</li>
 *   <li>plain text: decoded with the page charset (or the platform default
 *       when none is set) into a {@code TextParseData};</li>
 *   <li>anything else: treated as HTML, parsed with {@code htmlParser}, and
 *       its outgoing links are canonicalized against {@code contextURL} or
 *       the document's base URL when one is declared.</li>
 * </ul>
 *
 * @param page       the fetched page whose content is parsed
 * @param contextURL the URL against which relative links are resolved,
 *                   unless the HTML declares its own base URL
 * @throws NotAllowedContentException if the page is binary and binary content
 *                                    is not included in crawling
 * @throws ParseException             if the content cannot be parsed or decoded
 */
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
    if (Util.hasBinaryContent(page.getContentType())) {
        // BINARY
        BinaryParseData parseData = new BinaryParseData();
        if (config.isIncludeBinaryContentInCrawling()) {
            if (config.isProcessBinaryContentInCrawling()) {
                parseData.setBinaryContent(page.getContentData());
            } else {
                parseData.setHtml("<html></html>");
            }
            page.setParseData(parseData);
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
        } else {
            throw new NotAllowedContentException();
        }
    } else if (Util.hasPlainTextContent(page.getContentType())) {
        // plain Text
        try {
            TextParseData parseData = new TextParseData();
            if (page.getContentCharset() == null) {
                parseData.setTextContent(new String(page.getContentData()));
            } else {
                parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
            page.setParseData(parseData);
        } catch (Exception e) {
            // Pass the exception as the trailing argument so the stack trace is not lost.
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    } else {
        // isHTML
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            // Pass the exception as the trailing argument so the stack trace is not lost.
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }
        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());
        // Please note that identifying language takes less than 10 milliseconds
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());
        Set<WebURL> outgoingUrls = new HashSet<>();
        // A base URL declared by the document takes precedence over the caller's context URL.
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            String href = urlAnchorPair.getHref();
            if ((href == null) || href.trim().isEmpty()) {
                continue;
            }
            // Lower-case with Locale.ROOT: with the default locale (e.g. Turkish) "JAVASCRIPT:"
            // becomes "javascrıpt:" and would slip past the scheme checks below.
            String hrefLoweredCase = href.trim().toLowerCase(java.util.Locale.ROOT);
            if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
                String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
                if (url != null) {
                    WebURL webURL = new WebURL();
                    webURL.setURL(url);
                    webURL.setTag(urlAnchorPair.getTag());
                    webURL.setAnchor(urlAnchorPair.getAnchor());
                    outgoingUrls.add(webURL);
                    urlCount++;
                    // NOTE(review): the limit is checked after the add, so one link beyond
                    // getMaxOutgoingLinksToFollow() can be added — confirm this is intended.
                    if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                        break;
                    }
                }
            }
        }
        parseData.setOutgoingUrls(outgoingUrls);
        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) WebURL(edu.uci.ics.crawler4j.url.WebURL) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ParseException(edu.uci.ics.crawler4j.crawler.exceptions.ParseException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(edu.uci.ics.crawler4j.crawler.exceptions.ParseException)

Example 3 with LanguageIdentifier

use of org.apache.tika.language.LanguageIdentifier in project stanbol by apache.

the class LangIdEnhancementEngine method computeEnhancements.

/**
 * Detects the language of the content item's text and records it as a
 * text enhancement in the item's metadata graph.
 *
 * <p>When {@code probeLength} is positive and the text is longer than it,
 * only a window taken from the middle of the text is analysed. If the text
 * is blank, the method logs that fact and returns without adding anything.
 *
 * @param ci the content item to analyse and to enhance
 * @throws EngineException       declared by the signature; an
 *                               {@code InvalidContentException} is thrown here
 *                               when the text cannot be read
 * @throws IllegalStateException if no content part with a supported
 *                               mime type is found
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> part = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (part == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
            + "' found for ContentItem " + ci.getUri()
            + ": This is also checked in the canEnhance method! -> This indicated an Bug in the implementation of the EnhancementJobManager!");
    }

    String content;
    try {
        content = ContentItemHelper.getText(part.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (content.trim().isEmpty()) {
        log.info("No text contained in ContentPart {} of ContentItem {}", part.getKey(), ci.getUri());
        return;
    }

    // Restrict detection to a window centred on the middle of the text when probeLength > 0.
    int window = probeLength;
    if (window > 0 && content.length() > window) {
        int middle = content.length() / 2;
        int half = window / 2;
        content = content.substring(middle - half, middle + half);
    }

    String language = new LanguageIdentifier(content).getLanguage();
    log.info("language identified as " + language);

    // Record the detected language in the metadata graph while holding the write lock.
    Graph metadataGraph = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        IRI enhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadataGraph.add(new TripleImpl(enhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
        metadataGraph.add(new TripleImpl(enhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) Graph(org.apache.clerezza.commons.rdf.Graph) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) IOException(java.io.IOException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 4 with LanguageIdentifier

use of org.apache.tika.language.LanguageIdentifier in project stanbol by apache.

the class LangIdEngineTest method testLangId.

/**
 * Verifies that the language identifier reports {@code "en"} for the
 * {@code text} fixture.
 *
 * @throws IOException if there is an error when reading the text
 */
@Test
public void testLangId() throws IOException {
    assertEquals("en", new LanguageIdentifier(text).getLanguage());
}
Also used : LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) Test(org.junit.Test)

Example 5 with LanguageIdentifier

use of org.apache.tika.language.LanguageIdentifier in project stanbol by apache.

the class LangIdTest method testLangId.

/**
 * Checks language identification: the {@code text} fixture is expected to
 * be detected as {@code "en"}.
 *
 * @throws IOException if there is an error when reading the text
 */
@Test
public void testLangId() throws IOException {
    LanguageIdentifier identifier = new LanguageIdentifier(text);
    assertEquals("en", identifier.getLanguage());
}
Also used : LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) Test(org.junit.Test)

Aggregations

LanguageIdentifier (org.apache.tika.language.LanguageIdentifier): 6; IOException (java.io.IOException): 2; Metadata (org.apache.tika.metadata.Metadata): 2; Test (org.junit.Test): 2; ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException): 1; WebURL (edu.uci.ics.crawler4j.url.WebURL): 1; ByteArrayInputStream (java.io.ByteArrayInputStream): 1; InputStream (java.io.InputStream): 1; UnsupportedEncodingException (java.io.UnsupportedEncodingException): 1; ArrayList (java.util.ArrayList): 1; HashMap (java.util.HashMap): 1; HashSet (java.util.HashSet): 1; Set (java.util.Set): 1; Graph (org.apache.clerezza.commons.rdf.Graph): 1; IRI (org.apache.clerezza.commons.rdf.IRI): 1; PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 1; TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 1; Blob (org.apache.stanbol.enhancer.servicesapi.Blob): 1; InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException): 1; ElasticsearchParseException (org.elasticsearch.ElasticsearchParseException): 1