use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.
the class BinaryParseData method setBinaryContent.
/**
 * Parses the given bytes with {@code AUTO_DETECT_PARSER} and stores the
 * serialized output in the {@code html} field.
 *
 * <p>Any exception raised while parsing or decoding is logged and not
 * rethrown; in that case the {@code html} field is not assigned by this call.
 *
 * @param data raw bytes of the document to parse
 */
public void setBinaryContent(byte[] data) {
    ByteArrayOutputStream serialized = new ByteArrayOutputStream();
    InputStream source = new ByteArrayInputStream(data);
    try {
        TransformerHandler sink = getTransformerHandler(serialized, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
        AUTO_DETECT_PARSER.parse(source, sink, new Metadata(), context);

        // Workaround kept from the original: the XHTML namespace URI is
        // stripped from the output (the original note attributes it to a
        // DocType inserted by Tika).
        String rendered = new String(serialized.toByteArray(), DEFAULT_ENCODING);
        this.html = rendered.replace("http://www.w3.org/1999/xhtml", "");
    } catch (Exception e) {
        logger.error("Error parsing file", e);
    }
}
use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.
the class Parser method parse.
/**
 * Parses the content of {@code page} and attaches the resulting parse data to it.
 *
 * <p>Three cases are handled, selected by the page's content type:
 * <ul>
 *   <li>binary content: produces a {@code BinaryParseData}, provided the
 *       configuration includes binary content in the crawl;</li>
 *   <li>plain text: produces a {@code TextParseData} decoded with the page
 *       charset (or the platform default when the page has none);</li>
 *   <li>anything else is treated as HTML: produces an {@code HtmlParseData}
 *       with text, title, meta tags and outgoing links, and sets the page
 *       language.</li>
 * </ul>
 *
 * @param page       the fetched page whose content data is parsed
 * @param contextURL URL used to resolve relative links; replaced by the
 *                   document's base URL when the content handler reports one
 * @throws NotAllowedContentException if the page is binary and the
 *         configuration does not include binary content in the crawl
 * @throws ParseException if binary parsing yields no HTML, if text or HTML
 *         parsing fails, or if the page charset is not supported
 */
public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
    if (Util.hasBinaryContent(page.getContentType())) {
        // BINARY
        BinaryParseData parseData = new BinaryParseData();
        if (config.isIncludeBinaryContentInCrawling()) {
            if (config.isProcessBinaryContentInCrawling()) {
                parseData.setBinaryContent(page.getContentData());
            } else {
                parseData.setHtml("<html></html>");
            }
            page.setParseData(parseData);
            // setBinaryContent logs failures instead of throwing, so a missing
            // html value is the only signal that parsing did not succeed.
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
        } else {
            throw new NotAllowedContentException();
        }
    } else if (Util.hasPlainTextContent(page.getContentType())) {
        // plain Text
        try {
            TextParseData parseData = new TextParseData();
            if (page.getContentCharset() == null) {
                parseData.setTextContent(new String(page.getContentData()));
            } else {
                parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
            page.setParseData(parseData);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }
    } else {
        // isHTML
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }

        // Fall back to the encoding reported by the parser when the page has none.
        if (page.getContentCharset() == null) {
            page.setContentCharset(metadata.get("Content-Encoding"));
        }

        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());

        // Please note that identifying language takes less than 10 milliseconds
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());

        Set<WebURL> outgoingUrls = new HashSet<>();
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }

        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Enforce the cap BEFORE adding: the previous post-add check with
            // '>' let one link more than the configured maximum through.
            if (urlCount >= config.getMaxOutgoingLinksToFollow()) {
                break;
            }

            String href = urlAnchorPair.getHref();
            if ((href == null) || href.trim().isEmpty()) {
                continue;
            }

            // Locale.ROOT keeps the scheme check independent of the default
            // locale (e.g. Turkish casing rules map 'I' to a dotless 'ı',
            // which would let "JAVASCRIPT:" / "MAILTO:" slip past the filter).
            String hrefLoweredCase = href.trim().toLowerCase(java.util.Locale.ROOT);
            if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
                String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
                if (url != null) {
                    WebURL webURL = new WebURL();
                    webURL.setURL(url);
                    webURL.setTag(urlAnchorPair.getTag());
                    webURL.setAnchor(urlAnchorPair.getAnchor());
                    outgoingUrls.add(webURL);
                    urlCount++;
                }
            }
        }
        parseData.setOutgoingUrls(outgoingUrls);

        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }
}
use of org.apache.tika.metadata.Metadata in project crawler4j by yasserg.
the class HtmlContentHandlerTest method parseHtml.
/**
 * Runs {@code html} through {@code parser} with an {@code AllTagMapper}
 * registered in the parse context, and returns the content handler that
 * received the parse events.
 *
 * @param html markup to parse
 * @return the {@code HtmlContentHandler} populated during parsing
 * @throws Exception if the mapper cannot be instantiated or parsing fails
 */
private HtmlContentHandler parseHtml(String html) throws Exception {
    ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes());
    Metadata metadata = new Metadata();
    HtmlContentHandler contentHandler = new HtmlContentHandler();
    // Class.newInstance() is deprecated (it rethrows checked constructor
    // exceptions unchecked); go through the no-arg constructor instead.
    parseContext.set(HtmlMapper.class, AllTagMapper.class.getDeclaredConstructor().newInstance());
    parser.parse(bais, contentHandler, metadata, parseContext);
    return contentHandler;
}
use of org.apache.tika.metadata.Metadata in project Asqatasun by Asqatasun.
the class UploadAuditSetUpFormValidator method validateFiles.
/**
 * Control whether the uploaded files are of HTML type and whether their
 * size is under the maxFileSize limit.
 *
 * <p>For each entry of the command's file input list:
 * <ul>
 *   <li>a size above {@code maxFileSize} rejects the matching field with the
 *       "file size exceeded" message;</li>
 *   <li>a non-empty file has its MIME type detected from its content and is
 *       rejected when that type is not in {@code authorizedMimeType};</li>
 *   <li>an {@link IOException} while reading the file rejects the field with
 *       the "not HTML" message.</li>
 * </ul>
 * When every entry is empty, a general "no file uploaded" error is recorded.
 *
 * @param uploadAuditSetUpCommand the command holding the uploaded files
 * @param errors the binding errors that collect the rejections
 */
private void validateFiles(AuditSetUpCommand uploadAuditSetUpCommand, Errors errors) {
    boolean emptyFile = true;
    Metadata metadata = new Metadata();
    MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
    String mime;
    for (int i = 0; i < uploadAuditSetUpCommand.getFileInputList().length; i++) {
        // Field path of the i-th file input, shared by every rejection below.
        String fieldName = ID_INPUT_FILE_PREFIX + "[" + i + "]";
        try {
            CommonsMultipartFile cmf = uploadAuditSetUpCommand.getFileInputList()[i];
            if (cmf.getSize() > maxFileSize) {
                Long maxFileSizeInMega = maxFileSize / 1000000;
                String[] arg = { maxFileSizeInMega.toString() };
                errors.rejectValue(fieldName, FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}");
            }
            if (cmf.getSize() > 0) {
                emptyFile = false;
                // Close the upload stream once detection is done; previously
                // it was left open, leaking the underlying resource.
                try (BufferedInputStream fileContent = new BufferedInputStream(cmf.getInputStream())) {
                    mime = mimeTypes.detect(fileContent, metadata).toString();
                }
                LOGGER.debug("mime " + mime + " " + cmf.getOriginalFilename());
                if (!authorizedMimeType.contains(mime)) {
                    errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
                }
            }
        } catch (IOException ex) {
            LOGGER.warn(ex);
            errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
        }
    }
    if (emptyFile) {
        // if no file is uploaded
        LOGGER.debug("emptyFiles");
        errors.rejectValue(GENERAL_ERROR_MSG_KEY, NO_FILE_UPLOADED_MSG_BUNDLE_KEY);
    }
}
use of org.apache.tika.metadata.Metadata in project jackrabbit by apache.
the class LazyTextExtractorFieldTest method testEmptyParser.
/**
 * Runs a {@code ParsingTask} over random binary data declared as
 * {@code application/java-archive} and asserts that the text handed to
 * {@code setExtractedText} is the empty string.
 *
 * @see <a
 *      href="https://issues.apache.org/jira/browse/JCR-3296">JCR-3296</a>
 *      Indexing ignored file types creates some garbage
 */
public void testEmptyParser() throws Exception {
    InternalValue randomContent = InternalValue.create(new RandomInputStream(1, 1024));

    Metadata jarMetadata = new Metadata();
    jarMetadata.set(Metadata.CONTENT_TYPE, "application/java-archive");
    jarMetadata.set(Metadata.CONTENT_ENCODING, "UTF-8");

    new ParsingTask(getSearchIndex().getParser(), randomContent, jarMetadata, Integer.MAX_VALUE) {
        public void setExtractedText(String value) {
            // Nothing should be extracted for this content type.
            assertEquals("", value);
        }
    }.run();
}
Aggregations