Use of org.apache.tika.language.LanguageIdentifier in the project elasticsearch (by elastic):
the class AttachmentProcessor, method execute.
/**
 * Parses the binary attachment stored in {@code field} with Tika and writes the
 * requested {@link Property} values (content, language, date, title, author,
 * keywords, content type, content length) as a map under {@code targetField}.
 *
 * @param ingestDocument the document whose attachment field is parsed; mutated in place
 * @throws IllegalArgumentException if the field value is null and {@code ignoreMissing} is false
 * @throws ElasticsearchParseException if Tika fails to parse the attachment
 */
@Override
public void execute(IngestDocument ingestDocument) {
    Map<String, Object> additionalFields = new HashMap<>();
    byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);
    if (input == null && ignoreMissing) {
        // Missing field is explicitly tolerated: leave the document untouched.
        return;
    } else if (input == null) {
        throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
    }
    try {
        Metadata metadata = new Metadata();
        String parsedContent = TikaImpl.parse(input, metadata, indexedChars);
        if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) {
            // somehow tika seems to append a newline at the end automatically, lets remove that again
            additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim());
        }
        if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) {
            LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
            additionalFields.put(Property.LANGUAGE.toLowerCase(), identifier.getLanguage());
        }
        if (properties.contains(Property.DATE)) {
            String createdDate = metadata.get(TikaCoreProperties.CREATED);
            if (createdDate != null) {
                additionalFields.put(Property.DATE.toLowerCase(), createdDate);
            }
        }
        // The simple string-valued metadata entries all follow the same
        // "copy when requested and non-empty" rule; share one helper for them.
        putIfRequested(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE));
        putIfRequested(additionalFields, Property.AUTHOR, metadata.get("Author"));
        putIfRequested(additionalFields, Property.KEYWORDS, metadata.get("Keywords"));
        putIfRequested(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));
        if (properties.contains(Property.CONTENT_LENGTH)) {
            String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
            // Fall back to the length of the extracted text when Tika reports no length.
            long length = Strings.hasLength(contentLength)
                ? Long.parseLong(contentLength)
                : parsedContent.length();
            additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }
    } catch (Exception e) {
        throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
    }
    ingestDocument.setFieldValue(targetField, additionalFields);
}

/**
 * Stores {@code value} under the lower-cased key of {@code property} when that
 * property was requested and the value is a non-empty string.
 */
private void putIfRequested(Map<String, Object> additionalFields, Property property, String value) {
    if (properties.contains(property) && Strings.hasLength(value)) {
        additionalFields.put(property.toLowerCase(), value);
    }
}
Use of org.apache.tika.language.LanguageIdentifier in the project crawler4j (by yasserg):
the class Parser, method parse.
/**
 * Parses a fetched page and attaches the appropriate ParseData to it.
 * Dispatches on content type: binary, plain text, or HTML (the default branch).
 * For HTML pages, Tika's HtmlParser extracts body text, title, and meta tags,
 * the language is detected with LanguageIdentifier, and outgoing links are
 * canonicalized against the context (or in-page base) URL.
 *
 * @param page       the fetched page; its parse data, and possibly its charset
 *                   and language, are set in place
 * @param contextURL base URL used to resolve relative links; replaced by the
 *                   page's base URL when the HTML declares one
 * @throws NotAllowedContentException if the content is binary and including
 *                                    binary content is disabled by config
 * @throws ParseException             on any parsing failure
 */
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
if (Util.hasBinaryContent(page.getContentType())) {
// BINARY
BinaryParseData parseData = new BinaryParseData();
if (config.isIncludeBinaryContentInCrawling()) {
if (config.isProcessBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
} else {
// Binary body is kept but not processed: substitute an empty HTML document.
parseData.setHtml("<html></html>");
}
page.setParseData(parseData);
// No HTML could be derived from the binary content: treat as a parse failure.
if (parseData.getHtml() == null) {
throw new ParseException();
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
} else {
throw new NotAllowedContentException();
}
} else if (Util.hasPlainTextContent(page.getContentType())) {
// plain Text
try {
TextParseData parseData = new TextParseData();
if (page.getContentCharset() == null) {
// NOTE(review): decodes with the platform default charset — confirm intended.
parseData.setTextContent(new String(page.getContentData()));
} else {
parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
page.setParseData(parseData);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}
} else {
// isHTML
Metadata metadata = new Metadata();
HtmlContentHandler contentHandler = new HtmlContentHandler();
try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}
// Adopt the charset the parser detected when the response did not declare one.
if (page.getContentCharset() == null) {
page.setContentCharset(metadata.get("Content-Encoding"));
}
HtmlParseData parseData = new HtmlParseData();
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(DublinCore.TITLE));
parseData.setMetaTags(contentHandler.getMetaTags());
// Please note that identifying language takes less than 10 milliseconds
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
page.setLanguage(languageIdentifier.getLanguage());
Set<WebURL> outgoingUrls = new HashSet<>();
// An in-page base URL overrides the caller-supplied context for link resolution.
String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}
int urlCount = 0;
for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
String href = urlAnchorPair.getHref();
if ((href == null) || href.trim().isEmpty()) {
continue;
}
// Skip javascript:/mailto: links and anything containing '@' (email-like).
String hrefLoweredCase = href.trim().toLowerCase();
if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
outgoingUrls.add(webURL);
urlCount++;
// NOTE(review): the cap is checked after adding, so up to
// maxOutgoingLinksToFollow + 1 links are collected — confirm whether
// this off-by-one is intended.
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
}
}
}
}
parseData.setOutgoingUrls(outgoingUrls);
try {
if (page.getContentCharset() == null) {
// NOTE(review): platform default charset again — confirm intended.
parseData.setHtml(new String(page.getContentData()));
} else {
parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
}
page.setParseData(parseData);
} catch (UnsupportedEncodingException e) {
logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
throw new ParseException();
}
}
}
Use of org.apache.tika.language.LanguageIdentifier in the project stanbol (by apache):
the class LangIdEnhancementEngine, method computeEnhancements.
/**
 * Detects the language of the plain-text content of {@code ci} using Tika's
 * {@code LanguageIdentifier} and records it as a text enhancement
 * ({@code dc:language} plus {@code dc:type}) in the content item's metadata graph.
 *
 * @param ci the content item to enhance; its metadata graph is mutated under the write lock
 * @throws EngineException if the text content cannot be read
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This indicated an Bug in the implementation of the EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // When a probe length is configured and the text is longer, identify the
    // language from a window of that size taken from the middle of the text.
    if (probeLength > 0 && text.length() > probeLength) {
        int mid = text.length() / 2;
        int halfWindow = probeLength / 2;
        text = text.substring(mid - halfWindow, mid + halfWindow);
    }
    String language = new LanguageIdentifier(text).getLanguage();
    log.info("language identified as " + language);
    // Record the detected language in the metadata graph under the write lock.
    Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
        metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Use of org.apache.tika.language.LanguageIdentifier in the project stanbol (by apache):
the class LangIdEngineTest, method testLangId.
/**
 * Verifies that Tika's {@code LanguageIdentifier} recognizes the sample text
 * as English.
 *
 * @throws IOException if there is an error when reading the text
 */
@Test
public void testLangId() throws IOException {
    String language = new LanguageIdentifier(text).getLanguage();
    assertEquals("en", language);
}
Use of org.apache.tika.language.LanguageIdentifier in the project stanbol (by apache):
the class LangIdTest, method testLangId.
/**
 * Verifies that Tika's {@code LanguageIdentifier} recognizes the sample text
 * as English.
 *
 * @throws IOException if there is an error when reading the text
 */
@Test
public void testLangId() throws IOException {
    String detected = new LanguageIdentifier(text).getLanguage();
    assertEquals("en", detected);
}
Aggregations