Use of org.semanticdesktop.aperture.extractor.ExtractorException in the Apache Stanbol project.
Class HtmlTextExtractUtil, method extract.
/**
 * Parses an HTML stream into a DOM and runs the HTML extractor on it.
 *
 * @param id      URI of the resource; its string form is handed to the extractor
 * @param charset name of the encoding, or {@code null} to have it detected from the stream
 * @param input   stream with the HTML content
 * @param result  container that receives the extracted statements
 * @throws ExtractorException if charset detection fails with an I/O error
 */
public void extract(URI id, String charset, InputStream input, RDFContainer result) throws ExtractorException {
    String encoding = charset;
    if (encoding == null) {
        // NOTE(review): IksHtmlExtractor wraps the stream the same way before calling
        // CharsetRecognizer.detect, presumably because detection rewinds via mark/reset
        // so the parser below still sees the whole document - confirm against CharsetRecognizer.
        if (!input.markSupported()) {
            input = new java.io.BufferedInputStream(input);
        }
        try {
            encoding = CharsetRecognizer.detect(input, "html", null);
        } catch (IOException e) {
            // keep the original exception as cause so the stack trace is not lost
            LOG.error("Charset detection problem: " + e.getMessage(), e);
            throw new ExtractorException("Charset detection problem: " + e.getMessage(), e);
        }
    }
    Document doc = htmlParser.getDOM(input, encoding);
    htmlExtractor.extract(id.toString(), doc, null, result);
}
Use of org.semanticdesktop.aperture.extractor.ExtractorException in the Apache Stanbol project.
Class SimpleMailExtractor, method extractTextFromHtml.
/**
 * Runs the HTML extractor over an HTML string and concatenates the extracted
 * title, author, description, keywords and text into one plain-text string.
 *
 * @param string  the HTML markup
 * @param charset name of the encoding handed to the HTML extractor; may be {@code null}
 * @param rdf     container whose described URI identifies the temporary extraction result
 * @return the collected metadata and full text
 * @throws ExtractorException if the HTML extractor cannot be initialized or extraction fails
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage(), e);
    }
    // Encode with the same charset the extractor is told to decode with; otherwise
    // non-ASCII characters are garbled whenever the platform default differs.
    byte[] bytes;
    if (charset == null) {
        bytes = string.getBytes();
    } else {
        try {
            bytes = string.getBytes(charset);
        } catch (java.io.UnsupportedEncodingException e) {
            // fall back to the previous behaviour rather than failing the extraction
            logger.warn("Unsupported charset {}, using platform default encoding", charset);
            bytes = string.getBytes();
        }
    }
    InputStream stream = new ByteArrayInputStream(bytes);
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    extractor.extract(id, charset, stream, result);
    Model meta = result.getModel();
    try {
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // close the temporary model even if one of the getters throws
        meta.close();
    }
}
Use of org.semanticdesktop.aperture.extractor.ExtractorException in the Apache Stanbol project.
Class MP3FileExtractor, method performExtraction.
/**
 * Reads the ID3v1 and ID3v2 tags of an MP3 file, adds their fields to the
 * result container and types the resource as {@code NID3.ID3Audio}.
 *
 * @param arg0   not used by this method
 * @param arg1   the MP3 file; its string form is passed to {@code Mp3File}
 * @param arg2   not used by this method
 * @param arg3   not used by this method
 * @param result container that receives the extracted statements
 * @throws ExtractorException wrapping any tag, data or I/O error raised while reading the file
 */
@Override
protected void performExtraction(URI arg0, File arg1, Charset arg2, String arg3, RDFContainer result) throws ExtractorException {
    try {
        final Mp3File audio = new Mp3File(arg1.toString());
        // v1 tag is fetched before v2, as arguments are evaluated left to right
        final ID3Wrapper tags = new ID3Wrapper(audio.getId3v1Tag(), audio.getId3v2Tag());
        addId3Fields(tags, result);
        result.add(RDF.type, NID3.ID3Audio);
    } catch (UnsupportedTagException unsupportedTag) {
        throw new ExtractorException(unsupportedTag);
    } catch (InvalidDataException invalidData) {
        throw new ExtractorException(invalidData);
    } catch (IOException ioFailure) {
        throw new ExtractorException(ioFailure);
    }
}
Use of org.semanticdesktop.aperture.extractor.ExtractorException in the Apache Stanbol project.
Class IksHtmlExtractor, method extract.
/**
 * Parses an HTML stream into a DOM and runs every active extraction component
 * from the registry on it. Does nothing when no registry is set.
 *
 * @param id       URI of the resource; its string form is handed to each component
 * @param input    stream with the HTML content
 * @param charset  encoding of the stream, or {@code null} to detect it (fallback "UTF-8")
 * @param mimeType not used by this method
 * @param result   container that receives the extracted statements
 * @throws ExtractorException if charset detection fails with an I/O error
 */
public void extract(URI id, InputStream input, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
    if (registry == null) {
        return;
    }
    String encoding;
    if (charset == null) {
        if (!input.markSupported()) {
            input = new BufferedInputStream(input);
        }
        try {
            encoding = CharsetRecognizer.detect(input, "html", "UTF-8");
        } catch (IOException e) {
            // keep the original exception as cause so the stack trace is not lost
            LOG.error("Charset detection problem: " + e.getMessage(), e);
            throw new ExtractorException("Charset detection problem: " + e.getMessage(), e);
        }
    } else {
        encoding = charset.name();
    }
    Document doc = htmlParser.getDOM(input, encoding);
    /*
     * Re-serializing and re-parsing the document, i.e.
     *
     *   String docText = DOMUtils.getStringFromDoc(doc, "UTF-8", null);
     *   logger.info(docText);
     *   doc = DOMUtils.parse(docText, "UTF-8");
     *
     * solves the namespace problem but makes it difficult to handle normal
     * HTML and namespaced XHTML documents on a par. Rather avoid namespaces
     * in transformers for HTML elements! The problem remains that scripts
     * then cannot be tested offline. A way out might be to use disjunctions
     * in scripts, or to ignore the namespace by checking local-name() only
     * (match=*[local-name() = 'xxx']). Are Microformats, RDFa, ... only used
     * in XHTML? That would make the decision easier! Also have to solve the
     * problem of how to connect/map SemanticDesktop ontologies with those
     * from the extractors.
     */
    HashMap<String, HtmlExtractionComponent> extractors = registry.getRegistry();
    // the model size only feeds the debug output, so skip computing it otherwise
    final boolean debug = LOG.isDebugEnabled();
    long modelSize = debug ? result.getModel().size() : 0L;
    for (String s : registry.getActiveExtractors()) {
        LOG.debug("Extractor: {}", s);
        HtmlExtractionComponent extractor = extractors.get(s);
        // formats used also in other formats
        if (extractor != null) {
            extractor.extract(id.toString(), doc, null, result);
            if (debug) {
                long tmpSize = result.getModel().size();
                if (modelSize < tmpSize) {
                    LOG.debug("{} Statements added: {}", (tmpSize - modelSize), s);
                    modelSize = tmpSize;
                }
            }
        }
    }
}
Use of org.semanticdesktop.aperture.extractor.ExtractorException in the Apache Stanbol project.
Class SimpleMailExtractor, method extract.
/**
 * Parses a MIME message from the stream and adds its type, plain-text content,
 * subject, sender, recipients and dates to the result container.
 *
 * @param id       not used by this method
 * @param stream   stream containing the raw MIME message
 * @param charset  not used by this method
 * @param mimeType not used by this method
 * @param result   container that receives the extracted statements
 * @throws ExtractorException wrapping any messaging or I/O error raised while reading the message
 */
public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
    try {
        // parse the stream
        MimeMessage message = new MimeMessage(null, stream);
        result.add(RDF.type, NMO.Email);
        // extract the full-text
        StringBuilder buffer = new StringBuilder(10000);
        processMessage(message, buffer, result);
        String text = buffer.toString().trim();
        if (text.length() > 0) {
            result.add(NMO.plainTextMessageContent, text);
            result.add(NIE.plainTextContent, text);
        }
        // extract other metadata
        String title = message.getSubject();
        if (title != null) {
            title = title.trim();
            if (title.length() > 0) {
                result.add(NMO.messageSubject, title);
            }
        }
        try {
            copyAddress(message.getFrom(), NMO.from, result);
        } catch (AddressException e) {
            // best effort: a malformed From header must not abort the extraction,
            // but leave a trace so the missing sender can be explained
            logger.debug("Ignoring unparsable From header: {}", e.getMessage());
        }
        copyAddress(getRecipients(message, RecipientType.TO), NMO.to, result);
        copyAddress(getRecipients(message, RecipientType.CC), NMO.cc, result);
        copyAddress(getRecipients(message, RecipientType.BCC), NMO.bcc, result);
        MailUtil.getDates(message, result);
    } catch (MessagingException e) {
        throw new ExtractorException(e);
    } catch (IOException e) {
        throw new ExtractorException(e);
    }
}
Aggregations