use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class SimpleMailExtractor method main.
/**
 * Command-line entry point: runs a {@link SimpleMailExtractor} over every file
 * named in {@code args} and writes each resulting model to standard output as
 * RDF/XML.
 *
 * @param args paths of the files to process; each one gets its own RDF container
 * @throws Exception if a file cannot be opened, or extraction or serialization fails
 */
public static void main(String[] args) throws Exception {
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (int i = 0; i < args.length; ++i) {
        File file = new File(args[i]);
        // build the URI before opening the stream so a failure here cannot leak it
        URI uri = new URIImpl(file.toURI().toString());
        InputStream in = new FileInputStream(file);
        try {
            RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
            extractor.extract(uri, in, null, null, rdfContainer);
            Model model = rdfContainer.getModel();
            try {
                model.writeTo(System.out, Syntax.RdfXml);
            } finally {
                // release the model even when serialization fails
                model.close();
            }
        } finally {
            // the original never closed the input stream
            in.close();
        }
    }
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class SimpleMailExtractor method processContent.
/**
 * The recursive part of the extraction: dispatches on the runtime type of
 * {@code content}.
 * <ul>
 * <li>{@code String}: appended to {@code buffer}, followed by a single space.</li>
 * <li>{@code BodyPart}: if it carries a file name, an attachment resource is
 * added to the container's model (type, file name, optional encoding and MIME
 * type); afterwards the part's own content is processed recursively, with
 * HTML string content first passed through {@code extractTextFromHtml}.</li>
 * <li>{@code Multipart}: handed to the handler matching its MIME subtype.</li>
 * </ul>
 * Content of any other type is ignored.
 *
 * @param content the object to process (String, BodyPart or Multipart)
 * @param buffer  collects the extracted plain text
 * @param rdf     container that receives attachment metadata
 * @throws MessagingException if the mail structure cannot be read
 * @throws IOException        if the part content cannot be read
 * @throws ExtractorException if HTML text extraction fails
 */
protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf) throws MessagingException, IOException, ExtractorException {
    if (content instanceof String) {
        buffer.append(content);
        buffer.append(' ');
    } else if (content instanceof BodyPart) {
        BodyPart bodyPart = (BodyPart) content;
        DataHandler handler = bodyPart.getDataHandler();
        // NOTE(review): MimeUtility.getEncoding appears to yield a transfer encoding
        // (not a charset), yet the value is later fed to javaCharset - confirm intent
        String encoding = null;
        if (handler != null) {
            encoding = MimeUtility.getEncoding(handler);
        }
        String fileName = bodyPart.getFileName();
        String contentType = bodyPart.getContentType();
        if (fileName != null) {
            try {
                fileName = MimeUtility.decodeWord(fileName);
            } catch (MessagingException e) {
                // happens on unencoded file names! so just ignore it and leave the file name as it is
            }
            URI attachURI = URIGenerator.createNewRandomUniqueURI();
            rdf.add(NMO.hasAttachment, attachURI);
            Model m = rdf.getModel();
            m.addStatement(attachURI, RDF.type, NFO.Attachment);
            m.addStatement(attachURI, NFO.fileName, fileName);
            // encoding can only be non-null when a handler was present
            if (encoding != null) {
                m.addStatement(attachURI, NFO.encoding, encoding);
            }
            if (contentType != null) {
                // keep only "type/subtype", dropping any parameters
                contentType = (new ContentType(contentType)).getBaseType();
                m.addStatement(attachURI, NIE.mimeType, contentType.trim());
            }
            // TODO: encoding?
        }
        // append the content, if any
        content = bodyPart.getContent();
        // remove any html markup if necessary
        if (contentType != null && content instanceof String) {
            // locale-independent lowercasing: the default-locale variant can
            // mangle ASCII letters (e.g. 'I' under a Turkish locale)
            contentType = contentType.toLowerCase(java.util.Locale.ROOT);
            if (contentType.indexOf("text/html") >= 0) {
                if (encoding != null) {
                    encoding = MimeUtility.javaCharset(encoding);
                }
                content = extractTextFromHtml((String) content, encoding, rdf);
            }
        }
        processContent(content, buffer, rdf);
    } else if (content instanceof Multipart) {
        Multipart multipart = (Multipart) content;
        String subType = null;
        String contentType = multipart.getContentType();
        if (contentType != null) {
            ContentType ct = new ContentType(contentType);
            subType = ct.getSubType();
            if (subType != null) {
                // Locale.ROOT so that e.g. "SIGNED" always becomes "signed",
                // regardless of the JVM's default locale
                subType = subType.trim().toLowerCase(java.util.Locale.ROOT);
            }
        }
        if ("alternative".equals(subType)) {
            handleAlternativePart(multipart, buffer, rdf);
        } else if ("signed".equals(subType)) {
            handleProtectedPart(multipart, 0, buffer, rdf);
        } else if ("encrypted".equals(subType)) {
            handleProtectedPart(multipart, 1, buffer, rdf);
        } else {
            // handles multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
            handleMixedPart(multipart, buffer, rdf);
        }
    }
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class SimpleMailExtractor method extractTextFromHtml.
/**
 * Runs the HTML extractor over {@code string} and returns the extracted title,
 * author, description, keywords and body text concatenated into one string.
 * The extraction result is written to a temporary container created for the
 * URI described by {@code rdf}; its model is closed before returning.
 *
 * @param string  the HTML markup to process
 * @param charset charset name handed to the HTML extractor; may be {@code null}
 * @param rdf     container whose described URI identifies the document
 * @return the collected metadata and full text
 * @throws ExtractorException if the HTML extractor cannot be initialized or fails
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        // NOTE(review): the cause is dropped here; chain it if ExtractorException
        // offers a (String, Throwable) constructor - not visible from this block
        throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage());
    }
    // NOTE(review): getBytes() uses the platform default charset, which may not
    // match the 'charset' passed to the extractor - confirm this is intended
    InputStream stream = new ByteArrayInputStream(string.getBytes());
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    extractor.extract(id, charset, stream, result);
    Model meta = result.getModel();
    try {
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // always release the temporary model, even if one of the getters throws
        meta.close();
    }
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class TestMetaxaCore method testRdfaExtraction.
/**
 * Tests extraction from an RDFa-annotated HTML document: the extracted text
 * must match the expected result file (after {@code cleanup}) and the model
 * must contain the expected number of triples.
 *
 * @throws Exception if a test resource cannot be read or extraction fails
 */
@Test
public void testRdfaExtraction() throws Exception {
    String testFile = "test-rdfa.html";
    String testResultFile = "rdfa-res.txt";
    // extract text from RDFa annotated html
    InputStream in = getResourceAsStream(testFile);
    try {
        assertNotNull("failed to load resource " + testFile, in);
        Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
        String text = MetaxaCore.getText(m);
        // get expected result
        String expectedText;
        InputStream in2 = getResourceAsStream(testResultFile);
        try {
            assertNotNull("failed to load resource " + testResultFile, in2);
            expectedText = IOUtils.toString(in2, "utf-8");
        } finally {
            // the stream was previously left open after being read
            if (in2 != null) {
                in2.close();
            }
        }
        // test
        assertEquals(cleanup(expectedText), cleanup(text));
        // show triples
        int tripleCounter = this.printTriples(m);
        assertEquals(10, tripleCounter);
    } finally {
        // kept open until the end in case the model still reads from it
        if (in != null) {
            in.close();
        }
    }
}
use of org.ontoware.rdf2go.model.Model in project stanbol by apache.
the class RDF2GoUtils method urifyBlankNodes.
/**
 * Replaces every blank node that occurs as subject or object of a statement in
 * {@code model} with a newly generated URI. Within one call the same blank
 * node is always mapped to the same URI. The affected statements are collected
 * in two temporary models and applied to {@code model} as a single diff.
 *
 * @param model the model to rewrite in place
 */
public static void urifyBlankNodes(Model model) {
    HashMap<BlankNode, URI> nodeMap = new HashMap<BlankNode, URI>();
    Model add = RDF2Go.getModelFactory().createModel();
    add.open();
    try {
        Model remove = RDF2Go.getModelFactory().createModel();
        remove.open();
        try {
            for (Statement stmt : model) {
                RDFTerm subj = stmt.getSubject();
                URI pred = stmt.getPredicate();
                Node obj = stmt.getObject();
                boolean match = false;
                if (subj instanceof BlankNode) {
                    match = true;
                    URI newSubj = nodeMap.get(subj);
                    if (newSubj == null) {
                        // first time this blank node is seen: mint its URI
                        newSubj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(subj.asBlankNode(), newSubj);
                    }
                    subj = newSubj;
                }
                if (obj instanceof BlankNode) {
                    match = true;
                    URI newObj = nodeMap.get(obj);
                    if (newObj == null) {
                        newObj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(obj.asBlankNode(), newObj);
                    }
                    obj = newObj;
                }
                if (match) {
                    // schedule the original for removal and the rewritten one for addition
                    remove.addStatement(stmt);
                    add.addStatement(subj, pred, obj);
                }
            }
            ClosableIterator<Statement> addIt = add.iterator();
            try {
                ClosableIterator<Statement> removeIt = remove.iterator();
                try {
                    model.update(new DiffImpl(addIt, removeIt));
                } finally {
                    removeIt.close();
                }
            } finally {
                addIt.close();
            }
        } finally {
            // temporary models are released even if the update fails
            remove.close();
        }
    } finally {
        add.close();
    }
}
Aggregations